# 8 Tips for Success - Starting Your Data Science Project
By James Wilson

In [41]:
## SET UP 
# -*- coding: utf-8 -*-
import pandas as pd

# Load Data Sources

In [42]:
# Jeopardy Archive Data
# Update your working directory (where the file is located)
archive = pd.read_csv("C:/Users/jwilson2/Desktop/GitHub/UCLA-Intro-Python-Lecture/data/jeopardy_archive_data_raw.csv")
archive.head()

Unnamed: 0,Nickname,Player Details,Final Score,Show Info
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$27,600","J! Archive - Show #8045, aired 2019-07-26"
1,Maggie,"Maggie Lehrman, an editor and writer from Broo...",$0,"J! Archive - Show #8045, aired 2019-07-26"
2,Michael,"Michael Riggs, an educational therapist from T...",$2,"J! Archive - Show #8045, aired 2019-07-26"
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$4,400","J! Archive - Show #8044, aired 2019-07-25"
4,Eric,"Eric Kaplan, a retired OB-GYN physician from L...",$0,"J! Archive - Show #8044, aired 2019-07-25"


In [43]:
# @CoolJeopardyStories Twitter Data
# Update your working directory (where the file is located)
tweets = pd.read_csv('C:/Users/jwilson2/Desktop/GitHub/UCLA-Intro-Python-Lecture/data/CoolJepStories_tweets.csv')
tweets.head()

Unnamed: 0,id,created_at,favorite_count,retweet_count,text
0,1194098342285991938,2019-11-12 03:43:24,78,13,The coolest Jeopardy! story.\n#WeLoveYouAlex h...
1,1154934362737205253,2019-07-27 01:59:44,23,2,"7/26/19:\n""My medical school interview was wit..."
2,1154749896022548481,2019-07-26 13:46:44,12,1,If you are interested in tipping a hat of grat...
3,1154749838606688257,2019-07-26 13:46:30,165,8,Today is the final day @CoolJepStories posts a...
4,1154567652465106944,2019-07-26 01:42:33,6,1,"7/25/19:\n""I like cooking with my 2-year-old s..."


# Tip #1 - Evaluate Column Names and Types

In [44]:
archive.shape # row and column counts 

(6000, 4)

In [45]:
archive.info() # description of the data objects themselves and an overall picture

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 4 columns):
Nickname          6000 non-null object
Player Details    6000 non-null object
Final Score       6000 non-null object
Show Info         6000 non-null object
dtypes: object(4)
memory usage: 187.6+ KB


In [46]:
archive.dtypes # column types - important!! 

Nickname          object
Player Details    object
Final Score       object
Show Info         object
dtype: object

In [47]:
tweets.shape

(1620, 5)

In [48]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1620 entries, 0 to 1619
Data columns (total 5 columns):
id                1620 non-null int64
created_at        1620 non-null object
favorite_count    1620 non-null int64
retweet_count     1620 non-null int64
text              1620 non-null object
dtypes: int64(3), object(2)
memory usage: 63.4+ KB


In [49]:
tweets.dtypes

id                 int64
created_at        object
favorite_count     int64
retweet_count      int64
text              object
dtype: object

In [50]:
# Clean Columns 
archive.columns

Index(['Nickname', 'Player Details', 'Final Score', 'Show Info'], dtype='object')

In [51]:
# Map the lowering function to all column names
archive.columns = map(str.lower, archive.columns)
# replace spaces with '_'
archive.columns = archive.columns.str.replace(' ', '_')
# review
archive.columns

Index(['nickname', 'player_details', 'final_score', 'show_info'], dtype='object')

In [52]:
# Remove unneeded columns
tweets.columns

Index(['id', 'created_at', 'favorite_count', 'retweet_count', 'text'], dtype='object')

In [53]:
# Use drop and inplace to safetly drop specified columns
tweets.drop(['id','created_at'], axis=1, inplace=True)
tweets.columns

Index(['favorite_count', 'retweet_count', 'text'], dtype='object')

# Tip #2 - Detect NA's; why are they there?

In [54]:
archive.isna().mean().round(4)*100

nickname          0.0
player_details    0.0
final_score       0.0
show_info         0.0
dtype: float64

In [55]:
tweets.isna().mean().round(4)*100

favorite_count    0.0
retweet_count     0.0
text              0.0
dtype: float64

In [56]:
### Any NA's ? 

# Tip #3 - String Cleaning & Variable Creation

### Clean Final Score

In [57]:
print(archive["final_score"][0:10])

0    $27,600 
1         $0 
2         $2 
3     $4,400 
4         $0 
5         $0 
6    $30,000 
7     $3,000 
8     $6,100 
9    $12,400 
Name: final_score, dtype: object


In [58]:
archive["final_score"] = archive["final_score"].str.replace(',', '') # replace comma
archive["final_score"] = archive["final_score"].str.replace('$', '') # replace '$'
archive["final_score"] = archive["final_score"].astype(int) # convert to integer
print(archive["final_score"][0:10])

0    27600
1        0
2        2
3     4400
4        0
5        0
6    30000
7     3000
8     6100
9    12400
Name: final_score, dtype: int32


### Extract Player Details 

In [59]:
print(archive["player_details"][0])

Jason Zuffranieri, a math teacher from Albuquerque, New Mexico (whose 5-day cash winnings total $109,700)


In [60]:
print(archive["player_details"][10])

Peggy Robin, a publisher and chief moderator from Washington, D.C.


In [61]:
print(archive["player_details"][20])

Nathan Kaplan, a math professor from Los Angeles, California


In [62]:
for i in range(0,len(archive)):
    # extract player details 
    archive.at[i,"full_name"] = archive["player_details"].iloc[i].split(",")[0]
    archive.at[i,"occcupation"] = archive["player_details"].iloc[i].split(",")[1].split("from")[0].strip()
    archive.at[i,"hometown"] = archive["player_details"].iloc[i].split("from")[1].split("(")[0].strip()

### Extract Show Info

In [63]:
print(archive["show_info"][0])

J! Archive - Show #8045, aired 2019-07-26


In [64]:
print(archive["show_info"][12])

J! Archive - Show #8041, aired 2019-07-22


In [65]:
for i in range(0,len(archive)):
    # extract show info 
    archive.at[i,"archive_info"] = archive["show_info"].iloc[i].split(", aired")[0].strip()
    archive.at[i,"date"] = archive["show_info"].iloc[i].split(", aired")[1].strip()

### Total Running Score

### City and state values 

In [66]:
archive.loc[:,'city'] = ""
archive.loc[:,'city'] = archive.loc[:,'hometown'].str.split(',',expand=True)[0]
archive.loc[:,'city'] = archive.loc[:,'city'].str.strip()

archive.loc[:,'state'] = ""
archive.loc[:,'state'] = archive.loc[:,'hometown'].str.split(',',expand=True)[1]
archive.loc[:,'state'] = archive.loc[:,'state'].str.strip()

### Extract fun facts and show date from tweet

In [67]:
tweets["text"][0] # not a good tweet

'The coolest Jeopardy! story.\n#WeLoveYouAlex https://t.co/P02SrYHoO0'

In [68]:
tweets["text"][1] # good tweet~! 

'7/26/19:\n"My medical school interview was with a @Jeopardy! contestant who fell to @KenJennings."\n"The children\'s\' book industry is phenomenal."\n"A movie editor in France has a similar name to mine."\n#Jeopardy https://t.co/NUhCad6Nwh'

In [69]:
# Flag just relevant tweets 
tweets["game_info_flg"] = ""

#subset to just game events
for i in range(0,len(tweets)):
    tweets.at[i,'game_info_flg'] = tweets['text'][i][0].isnumeric() # flag where numeric start to text 

tweets_games = tweets[tweets["game_info_flg"] == True]
tweets_games.reset_index(drop=True, inplace=True)


In [70]:
# set variables 

for i in range(0,len(tweets_games)):
    twt_txt = tweets_games["text"].loc[i].split("\n") # split tweets at new line 
    
    # split and save as a new row 
    
    tweets_games.loc[i,"date"] = twt_txt[0]
    tweets_games.loc[i,"answer1"] = twt_txt[1]
    tweets_games.loc[i,"answer2"] = twt_txt[2]
    
    if(len(twt_txt) > 3): # nuance in data; need to check for third answer if string is long enough 
        tweets_games.loc[i,"answer3"] = twt_txt[3]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)


In [None]:
tweets_games["date"] = ""
tweets_games["answer1"] = ""
tweets_games["answer2"] = ""
tweets_games["answer3"] = ""

In [71]:
tweets_games.head()

Unnamed: 0,favorite_count,retweet_count,text,game_info_flg,date,answer1,answer2,answer3
0,23,2,"7/26/19:\n""My medical school interview was wit...",True,7/26/19:,"""My medical school interview was with a @Jeopa...","""The children's' book industry is phenomenal.""","""A movie editor in France has a similar name t..."
1,6,1,"7/25/19:\n""I like cooking with my 2-year-old s...",True,7/25/19:,"""I like cooking with my 2-year-old son.""","""Chuck Norris' aunt thought I looked justl ike...","""It took me seconds to 'win' a game of anti-ch..."
2,8,3,7/24/19:\n“I played drums in a rhythmic troupe...,True,7/24/19:,“I played drums in a rhythmic troupe.”,“I had to memorize the Gettysburg Address.”,"“In Mexico, I was mistaken for Nicolas Cage.”"
3,5,0,"7/23/19:\n""I like telling dad jokes.""\n""I run ...",True,7/23/19:,"""I like telling dad jokes.""","""I run an online forum to talk about the neigh...","""My stuffed manatee is a comfort animal in my ..."
4,6,0,"7/22/19:\n""I explain forensic science to kids ...",True,7/22/19:,"""I explain forensic science to kids using the ...","""I went to Spain &amp; ran into a lady who was...","""I travel around the world playing Sudoku."""


# Tip #4 - Date Conversions 

In [72]:
# Create correct show date times 
print(archive['date'])

0       2019-07-26
1       2019-07-26
2       2019-07-26
3       2019-07-25
4       2019-07-25
           ...    
5995    2010-12-21
5996    2010-12-21
5997    2010-12-20
5998    2010-12-20
5999    2010-12-20
Name: date, Length: 6000, dtype: object


In [85]:
pd.to_datetime(archive['date'])
print(archive['date'])

0      2019-07-26
1      2019-07-26
2      2019-07-26
3      2019-07-25
4      2019-07-25
          ...    
5995   2010-12-21
5996   2010-12-21
5997   2010-12-20
5998   2010-12-20
5999   2010-12-20
Name: date, Length: 6000, dtype: datetime64[ns]


In [86]:
print(tweets_games['date'])

0      2019-07-26
1      2019-07-25
2      2019-07-24
3      2019-07-23
4      2019-07-22
          ...    
1404   2014-02-17
1405   2014-02-14
1406   2014-02-13
1407   2014-02-12
1408   2014-02-11
Name: date, Length: 1409, dtype: datetime64[ns]


In [87]:
tweets_games['date'].replace(regex=True,inplace=True,to_replace=r':',value=r'')
pd.to_datetime(tweets_games['date'])
print(tweets_games['date'])

0      2019-07-26
1      2019-07-25
2      2019-07-24
3      2019-07-23
4      2019-07-22
          ...    
1404   2014-02-17
1405   2014-02-14
1406   2014-02-13
1407   2014-02-12
1408   2014-02-11
Name: date, Length: 1409, dtype: datetime64[ns]


# Tip #5 - Join Relevant Data Fields

### Join the data together on show date & etc. 

In [79]:
tweets_games.head()

Unnamed: 0,favorite_count,retweet_count,text,game_info_flg,date,answer1,answer2,answer3
0,23,2,"7/26/19:\n""My medical school interview was wit...",True,2019-07-26,"""My medical school interview was with a @Jeopa...","""The children's' book industry is phenomenal.""","""A movie editor in France has a similar name t..."
1,6,1,"7/25/19:\n""I like cooking with my 2-year-old s...",True,2019-07-25,"""I like cooking with my 2-year-old son.""","""Chuck Norris' aunt thought I looked justl ike...","""It took me seconds to 'win' a game of anti-ch..."
2,8,3,7/24/19:\n“I played drums in a rhythmic troupe...,True,2019-07-24,“I played drums in a rhythmic troupe.”,“I had to memorize the Gettysburg Address.”,"“In Mexico, I was mistaken for Nicolas Cage.”"
3,5,0,"7/23/19:\n""I like telling dad jokes.""\n""I run ...",True,2019-07-23,"""I like telling dad jokes.""","""I run an online forum to talk about the neigh...","""My stuffed manatee is a comfort animal in my ..."
4,6,0,"7/22/19:\n""I explain forensic science to kids ...",True,2019-07-22,"""I explain forensic science to kids using the ...","""I went to Spain &amp; ran into a lady who was...","""I travel around the world playing Sudoku."""


In [78]:
archive.head()

Unnamed: 0,nickname,player_details,final_score,show_info,full_name,occcupation,hometown,archive_info,date,city,state
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...",27600,"J! Archive - Show #8045, aired 2019-07-26",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8045,2019-07-26,Albuquerque,New Mexico
1,Maggie,"Maggie Lehrman, an editor and writer from Broo...",0,"J! Archive - Show #8045, aired 2019-07-26",Maggie Lehrman,an editor and writer,"Brooklyn, New York",J! Archive - Show #8045,2019-07-26,Brooklyn,New York
2,Michael,"Michael Riggs, an educational therapist from T...",2,"J! Archive - Show #8045, aired 2019-07-26",Michael Riggs,an educational therapist,"Tustin, California",J! Archive - Show #8045,2019-07-26,Tustin,California
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...",4400,"J! Archive - Show #8044, aired 2019-07-25",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8044,2019-07-25,Albuquerque,New Mexico
4,Eric,"Eric Kaplan, a retired OB-GYN physician from L...",0,"J! Archive - Show #8044, aired 2019-07-25",Eric Kaplan,a retired OB-GYN physician,"Long Beach, California",J! Archive - Show #8044,2019-07-25,Long Beach,California


In [None]:
# Reformat Data
# Melt twitter data 
tweets_df = pd.melt(game_tweets, id_vars =['id', 'created_at', 'favorite_count', 'retweet_count', 'text',
       'game_info_flg', 'date'], var_name="answer_number")


In [None]:
# Add "Answer" variable to twitter 
    #IMPORTANT - reverse order of answers for archive data to match how twitter account was organized
archive["answer_number"] = ""
archive['dt_indx'] = archive.groupby(['Date']).cumcount()+1
archive.loc[archive_df["dt_indx"]==1,"answer_number"] = "Answer3"
archive.loc[archive_df["dt_indx"]==2,"answer_number"] = "Answer2"
archive.loc[archive_df["dt_indx"]==3,"answer_number"] = "Answer1"

In [None]:
# Merge Data Files
jeopardy = pd.merge(archive,tweets_df, on=['date','answer_number'])
jeopardy.head(10)

# Tip #6 - Numeric summaries and outlier detection

In [34]:
# Explore money amounts 



In [35]:
# Understand 

# Tip #7 - Create frequency tables

In [36]:
# State Values per $$ 

In [None]:
state_counts = jeopardy_df.groupby('state')['Full Name'].agg(['count']).reset_index()
state_counts.sort_values(by=['count'], ascending=False,inplace=True)
state_counts.head(25)

# Tip #8 - Build graphics to explore ideas

In [37]:
# Simple State map by player count 

In [None]:
import plotly.express as px
fig = px.bar(state_counts, x='state', y='count',color='count',
             labels={'state':'State', 'count':"Count"}, height=400)
fig.show()

In [38]:
# Distribution of scores over time 

# You know your data!! Now create some cool insights! 