## 010_an_merge
### Merge and clean jeopardy data sources 
### James Wilson

In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

  from collections import Mapping, defaultdict


In [2]:
# Load Data
archive_df = pd.read_csv("../data/raw/jeopardy_archive_data.csv")
twitter_df = pd.read_csv("../data/raw/jeopardy_twitter_data.csv")

In [3]:
# Map the lowering function to all column names
archive_df.columns = map(str.lower, archive_df.columns)
twitter_df.columns = map(str.lower, twitter_df.columns)
# replace spaces with '_'
archive_df.columns = archive_df.columns.str.replace(' ', '_')
twitter_df.columns = twitter_df.columns.str.replace(' ', '_')

In [4]:
# review
archive_df.columns
archive_df.head(10)

Unnamed: 0,nickname,player_details,final_score,show_info,full_name,occupation,hometown,archive_info,date
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$27,600","J! Archive - Show #8045, aired 2019-07-26",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8045,2019-07-26
1,Maggie,"Maggie Lehrman, an editor and writer from Broo...",$0,"J! Archive - Show #8045, aired 2019-07-26",Maggie Lehrman,an editor and writer,"Brooklyn, New York",J! Archive - Show #8045,2019-07-26
2,Michael,"Michael Riggs, an educational therapist from T...",$2,"J! Archive - Show #8045, aired 2019-07-26",Michael Riggs,an educational therapist,"Tustin, California",J! Archive - Show #8045,2019-07-26
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$4,400","J! Archive - Show #8044, aired 2019-07-25",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8044,2019-07-25
4,Eric,"Eric Kaplan, a retired OB/GYN physician from L...",$0,"J! Archive - Show #8044, aired 2019-07-25",Eric Kaplan,a retired OB/GYN physician,"Long Beach, California",J! Archive - Show #8044,2019-07-25
5,Shari,"Shari Meyer, a high school English teacher fro...",$0,"J! Archive - Show #8044, aired 2019-07-25",Shari Meyer,a high school English teacher,"Somerville, Massachusetts",J! Archive - Show #8044,2019-07-25
6,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$30,000","J! Archive - Show #8043, aired 2019-07-24",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8043,2019-07-24
7,Michelle,"Michelle Bruck, an attorney from Levittown, Pe...","$3,000","J! Archive - Show #8043, aired 2019-07-24",Michelle Bruck,an attorney,"Levittown, Pennsylvania",J! Archive - Show #8043,2019-07-24
8,Corin,"Corin Purifoy, a fiber artist from Milwaukee, ...","$6,100","J! Archive - Show #8043, aired 2019-07-24",Corin Purifoy,a fiber artist,"Milwaukee, Wisconsin",J! Archive - Show #8043,2019-07-24
9,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$30,100","J! Archive - Show #8042, aired 2019-07-23",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8042,2019-07-23


In [5]:
# Reformat Data
# Melt twitter data 
twitter_df_melt = pd.melt(twitter_df, id_vars =['id', 'created_at', 'favorite_count', 'retweet_count', 'text',
       'game_info_flg', 'date'], var_name="answer_number")
#twitter_df_melt.sort_values(['date'],ascending=False).head(12)

In [6]:
# Add "Answer" variable to twitter 
    #IMPORTANT - reverse order of answers for archive data to match how twitter account was organized
archive_df["answer_number"] = ""
archive_df['dt_indx'] = archive_df.groupby(['date']).cumcount()+1
archive_df.loc[archive_df["dt_indx"]==1,"answer_number"] = "answer3"
archive_df.loc[archive_df["dt_indx"]==2,"answer_number"] = "answer2"
archive_df.loc[archive_df["dt_indx"]==3,"answer_number"] = "answer1"

In [7]:
archive_df.shape

(9000, 11)

In [8]:
twitter_df_melt.shape

(4227, 9)

In [9]:
# Merge Data Files
jeopardy = pd.merge(archive_df,twitter_df_melt, on=['date','answer_number'])
jeopardy.head(10)

Unnamed: 0,nickname,player_details,final_score,show_info,full_name,occupation,hometown,archive_info,date,answer_number,dt_indx,id,created_at,favorite_count,retweet_count,text,game_info_flg,value
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$27,600","J! Archive - Show #8045, aired 2019-07-26",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8045,2019-07-26,answer3,1,1154934362737205253,2019-07-27 01:59:44,23,2,"7/26/19:\n""My medical school interview was wit...",True,"""A movie editor in France has a similar name t..."
1,Maggie,"Maggie Lehrman, an editor and writer from Broo...",$0,"J! Archive - Show #8045, aired 2019-07-26",Maggie Lehrman,an editor and writer,"Brooklyn, New York",J! Archive - Show #8045,2019-07-26,answer2,2,1154934362737205253,2019-07-27 01:59:44,23,2,"7/26/19:\n""My medical school interview was wit...",True,"""The children's' book industry is phenomenal."""
2,Michael,"Michael Riggs, an educational therapist from T...",$2,"J! Archive - Show #8045, aired 2019-07-26",Michael Riggs,an educational therapist,"Tustin, California",J! Archive - Show #8045,2019-07-26,answer1,3,1154934362737205253,2019-07-27 01:59:44,23,2,"7/26/19:\n""My medical school interview was wit...",True,"""My medical school interview was with a @Jeopa..."
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$4,400","J! Archive - Show #8044, aired 2019-07-25",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8044,2019-07-25,answer3,1,1154567652465106944,2019-07-26 01:42:33,6,1,"7/25/19:\n""I like cooking with my 2-year-old s...",True,"""It took me seconds to 'win' a game of anti-ch..."
4,Eric,"Eric Kaplan, a retired OB/GYN physician from L...",$0,"J! Archive - Show #8044, aired 2019-07-25",Eric Kaplan,a retired OB/GYN physician,"Long Beach, California",J! Archive - Show #8044,2019-07-25,answer2,2,1154567652465106944,2019-07-26 01:42:33,6,1,"7/25/19:\n""I like cooking with my 2-year-old s...",True,"""Chuck Norris' aunt thought I looked justl ike..."
5,Shari,"Shari Meyer, a high school English teacher fro...",$0,"J! Archive - Show #8044, aired 2019-07-25",Shari Meyer,a high school English teacher,"Somerville, Massachusetts",J! Archive - Show #8044,2019-07-25,answer1,3,1154567652465106944,2019-07-26 01:42:33,6,1,"7/25/19:\n""I like cooking with my 2-year-old s...",True,"""I like cooking with my 2-year-old son."""
6,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$30,000","J! Archive - Show #8043, aired 2019-07-24",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8043,2019-07-24,answer3,1,1154498659637051393,2019-07-25 21:08:24,8,3,7/24/19:\n“I played drums in a rhythmic troupe...,True,"“In Mexico, I was mistaken for Nicolas Cage.”"
7,Michelle,"Michelle Bruck, an attorney from Levittown, Pe...","$3,000","J! Archive - Show #8043, aired 2019-07-24",Michelle Bruck,an attorney,"Levittown, Pennsylvania",J! Archive - Show #8043,2019-07-24,answer2,2,1154498659637051393,2019-07-25 21:08:24,8,3,7/24/19:\n“I played drums in a rhythmic troupe...,True,“I had to memorize the Gettysburg Address.”
8,Corin,"Corin Purifoy, a fiber artist from Milwaukee, ...","$6,100","J! Archive - Show #8043, aired 2019-07-24",Corin Purifoy,a fiber artist,"Milwaukee, Wisconsin",J! Archive - Show #8043,2019-07-24,answer1,3,1154498659637051393,2019-07-25 21:08:24,8,3,7/24/19:\n“I played drums in a rhythmic troupe...,True,“I played drums in a rhythmic troupe.”
9,Jason,"Jason Zuffranieri, a math teacher from Albuque...","$30,100","J! Archive - Show #8042, aired 2019-07-23",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8042,2019-07-23,answer3,1,1154153044772306945,2019-07-24 22:15:03,5,0,"7/23/19:\n""I like telling dad jokes.""\n""I run ...",True,"""My stuffed manatee is a comfort animal in my ..."


In [10]:
jeopardy.shape

(3768, 18)

In [11]:
import gender_guesser.detector as gender
gd = gender.Detector()
jeopardy['gender'] = jeopardy['nickname'].apply(gd.get_gender)

In [12]:
jeopardy.groupby(['gender'])['gender'].count()

gender
andy               37
female           1384
male             1658
mostly_female     205
mostly_male       217
unknown           267
Name: gender, dtype: int64

In [13]:
# Clean Data 
# clean money column
jeopardy['final_score'] = jeopardy['final_score'].str.replace(',', '')
jeopardy['final_score'] = jeopardy['final_score'].str.replace('$', '')
jeopardy['final_score'] = jeopardy['final_score'].astype(int)
        
# rename value column
jeopardy.rename(columns = {'value':'anecdote'}, inplace = True) 
jeopardy['anecdote'] = jeopardy['anecdote'].str.replace('"', '')

In [14]:
jeopardy.head()

Unnamed: 0,nickname,player_details,final_score,show_info,full_name,occupation,hometown,archive_info,date,answer_number,dt_indx,id,created_at,favorite_count,retweet_count,text,game_info_flg,anecdote,gender
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...",27600,"J! Archive - Show #8045, aired 2019-07-26",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8045,2019-07-26,answer3,1,1154934362737205253,2019-07-27 01:59:44,23,2,"7/26/19:\n""My medical school interview was wit...",True,A movie editor in France has a similar name to...,male
1,Maggie,"Maggie Lehrman, an editor and writer from Broo...",0,"J! Archive - Show #8045, aired 2019-07-26",Maggie Lehrman,an editor and writer,"Brooklyn, New York",J! Archive - Show #8045,2019-07-26,answer2,2,1154934362737205253,2019-07-27 01:59:44,23,2,"7/26/19:\n""My medical school interview was wit...",True,The children's' book industry is phenomenal.,female
2,Michael,"Michael Riggs, an educational therapist from T...",2,"J! Archive - Show #8045, aired 2019-07-26",Michael Riggs,an educational therapist,"Tustin, California",J! Archive - Show #8045,2019-07-26,answer1,3,1154934362737205253,2019-07-27 01:59:44,23,2,"7/26/19:\n""My medical school interview was wit...",True,My medical school interview was with a @Jeopar...,male
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...",4400,"J! Archive - Show #8044, aired 2019-07-25",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8044,2019-07-25,answer3,1,1154567652465106944,2019-07-26 01:42:33,6,1,"7/25/19:\n""I like cooking with my 2-year-old s...",True,It took me seconds to 'win' a game of anti-chess.,male
4,Eric,"Eric Kaplan, a retired OB/GYN physician from L...",0,"J! Archive - Show #8044, aired 2019-07-25",Eric Kaplan,a retired OB/GYN physician,"Long Beach, California",J! Archive - Show #8044,2019-07-25,answer2,2,1154567652465106944,2019-07-26 01:42:33,6,1,"7/25/19:\n""I like cooking with my 2-year-old s...",True,Chuck Norris' aunt thought I looked justl ike ...,male


In [15]:
jeopardy['clean_text'] = jeopardy.loc[:,'anecdote']

In [16]:
# clean general clean_text 
# replace "&amp;" with "and" 
jeopardy['clean_text'] = jeopardy['clean_text'].str.replace('&amp;','and')

In [17]:
# remove " " - REVIEW THIS STEP 
jeopardy.loc[:,'clean_text'] = jeopardy.loc[:,'clean_text'].str.replace('"','')
jeopardy.loc[:,'clean_text'] = jeopardy.loc[:,'clean_text'].str.replace("'",'')
jeopardy.loc[:,'clean_text'] = jeopardy.loc[:,'clean_text'].str.lstrip('\"')

# fix jeopardy exclamation mark
jeopardy.loc[:,'clean_text'] = jeopardy.loc[:,'clean_text'].str.replace('Jeopardy!','Jeopardy')
# general misspelling
jeopardy.loc[:,'clean_text'] = jeopardy.loc[:,'clean_text'].str.replace('justl ike','just like')

In [18]:
# rename individuals with @ signs 
# Function to clean the names 
def Clean_names(jprdy): 
    if re.search('@', jprdy): 
        sentence = re.sub( r"([A-Z])", r" \1", jprdy).split()
        jprdy = ' '.join(sentence)
        return jprdy
    else:
        return jprdy

# Updated the clean clean_text columns 
jeopardy.loc[:,'clean_text'] = jeopardy.loc[:,'clean_text'].apply(Clean_names) 

# remove '@'
jeopardy.loc[:,'clean_text'] = jeopardy.loc[:,'clean_text'].str.replace('@ ','')

In [19]:
# Clean hometown into city and state
jeopardy.loc[:,'city'] = ""
jeopardy.loc[:,'city'] = jeopardy.loc[:,'hometown'].str.split(',',expand=True)[0]
jeopardy.loc[:,'city'] = jeopardy.loc[:,'city'].str.strip()

jeopardy.loc[:,'state'] = ""
jeopardy.loc[:,'state'] = jeopardy.loc[:,'hometown'].str.split(',',expand=True)[1]
jeopardy.loc[:,'state'] = jeopardy.loc[:,'state'].str.strip()

In [20]:
# map gross winnings to each player 
jeopardy['gross_winnings'] = jeopardy.groupby('full_name')['final_score'].transform(sum)

In [21]:
jeopardy.shape

(3768, 23)

In [22]:
# Incorporate us cities geo information 
uscities = pd.read_csv("../data/raw/uscities.csv", dtype=str)
uscities.head()

Unnamed: 0,city,city_ascii,state_id,state_name,county_fips,county_name,county_fips_all,county_name_all,lat,lng,population,density,source,military,incorporated,timezone,ranking,zips,id
0,South Creek,South Creek,WA,Washington,53053,Pierce,53053,Pierce,46.9994,-122.3921,2500,125,polygon,False,True,America/Los_Angeles,3,98580 98387 98338,1840042075
1,Roslyn,Roslyn,WA,Washington,53037,Kittitas,53037,Kittitas,47.2507,-121.0989,947,84,polygon,False,True,America/Los_Angeles,3,98941 98068 98925,1840019842
2,Sprague,Sprague,WA,Washington,53043,Lincoln,53043,Lincoln,47.3048,-117.9713,441,163,polygon,False,True,America/Los_Angeles,3,99032,1840021107
3,Gig Harbor,Gig Harbor,WA,Washington,53053,Pierce,53053,Pierce,47.3352,-122.5968,9507,622,polygon,False,True,America/Los_Angeles,3,98332 98335,1840019855
4,Lake Cassidy,Lake Cassidy,WA,Washington,53061,Snohomish,53061,Snohomish,48.0639,-122.092,3591,131,polygon,False,True,America/Los_Angeles,3,98223 98258 98270,1840041959


In [23]:
# remove non state values 
jeopardy = pd.merge(jeopardy, uscities, left_on=['city','state'], right_on = ["city","state_name"], how="inner")

In [24]:
jeopardy.shape

(3150, 41)

In [25]:
jeopardy[jeopardy.city == "Los Angeles"].county_fips

214    06037
215    06037
216    06037
217    06037
218    06037
       ...  
308    06037
309    06037
310    06037
311    06037
312    06037
Name: county_fips, Length: 99, dtype: object

In [26]:
jeopardy.head()

Unnamed: 0,nickname,player_details,final_score,show_info,full_name,occupation,hometown,archive_info,date,answer_number,...,lng,population,density,source,military,incorporated,timezone,ranking,zips,id_y
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...",27600,"J! Archive - Show #8045, aired 2019-07-26",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8045,2019-07-26,answer3,...,-106.6464,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176
1,Jason,"Jason Zuffranieri, a math teacher from Albuque...",4400,"J! Archive - Show #8044, aired 2019-07-25",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8044,2019-07-25,answer3,...,-106.6464,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176
2,Jason,"Jason Zuffranieri, a math teacher from Albuque...",30000,"J! Archive - Show #8043, aired 2019-07-24",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8043,2019-07-24,answer3,...,-106.6464,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...",30100,"J! Archive - Show #8042, aired 2019-07-23",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8042,2019-07-23,answer3,...,-106.6464,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176
4,Jason,"Jason Zuffranieri, a math teacher from Albuque...",18600,"J! Archive - Show #8041, aired 2019-07-22",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8041,2019-07-22,answer3,...,-106.6464,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176


In [27]:
# Flag winner of each game

In [28]:
win_amnts = jeopardy.groupby('date')['final_score'].agg(['max']).reset_index()

In [29]:
win_amnts.rename(columns={"max":"final_score"}, inplace=True)
win_amnts['winner_flg'] = 1
win_amnts

Unnamed: 0,date,final_score,winner_flg
0,2012-10-24,25000,1
1,2014-02-11,10800,1
2,2014-02-14,6800,1
3,2014-02-17,393,1
4,2014-02-24,20800,1
...,...,...,...
1209,2019-07-22,18600,1
1210,2019-07-23,30100,1
1211,2019-07-24,30000,1
1212,2019-07-25,4400,1


In [30]:
# Merge winner ID onto dataset 
jeopardy = pd.merge(jeopardy,win_amnts,on=['date','final_score'],how='left')
jeopardy['winner_flg'] = np.where(jeopardy['winner_flg']==1,1,0)
jeopardy.head(25)

Unnamed: 0,nickname,player_details,final_score,show_info,full_name,occupation,hometown,archive_info,date,answer_number,...,population,density,source,military,incorporated,timezone,ranking,zips,id_y,winner_flg
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...",27600,"J! Archive - Show #8045, aired 2019-07-26",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8045,2019-07-26,answer3,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
1,Jason,"Jason Zuffranieri, a math teacher from Albuque...",4400,"J! Archive - Show #8044, aired 2019-07-25",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8044,2019-07-25,answer3,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
2,Jason,"Jason Zuffranieri, a math teacher from Albuque...",30000,"J! Archive - Show #8043, aired 2019-07-24",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8043,2019-07-24,answer3,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...",30100,"J! Archive - Show #8042, aired 2019-07-23",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8042,2019-07-23,answer3,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
4,Jason,"Jason Zuffranieri, a math teacher from Albuque...",18600,"J! Archive - Show #8041, aired 2019-07-22",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8041,2019-07-22,answer3,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
5,Jason,"Jason Zuffranieri, a math teacher from Albuque...",26600,"J! Archive - Show #8040, aired 2019-07-19",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8040,2019-07-19,answer1,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
6,Quin,"Quin Lewellen, a graphic designer from Albuque...",5599,"J! Archive - Show #7692, aired 2018-02-06",Quin Lewellen,a graphic designer,"Albuquerque, New Mexico",J! Archive - Show #7692,2018-02-06,answer1,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
7,Ryan,"Ryan Kious, a social media and web writer from...",799,"J! Archive - Show #7584, aired 2017-07-27",Ryan Kious,a social media and web writer,"Albuquerque, New Mexico",J! Archive - Show #7584,2017-07-27,answer2,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,0
8,Maggie,"Maggie Lehrman, an editor and writer from Broo...",0,"J! Archive - Show #8045, aired 2019-07-26",Maggie Lehrman,an editor and writer,"Brooklyn, New York",J! Archive - Show #8045,2019-07-26,answer2,...,2629150.0,14606,polygon,False,True,America/New_York,1,11212 11213 11216 11233 11238 11209 11214 1122...,1840034030,0
9,E.J.,"E.J. Wolborsky, a freelance project manager fr...",0,"J! Archive - Show #8026, aired 2019-07-01",E.J. Wolborsky,a freelance project manager,"Brooklyn, New York",J! Archive - Show #8026,2019-07-01,answer3,...,2629150.0,14606,polygon,False,True,America/New_York,1,11212 11213 11216 11233 11238 11209 11214 1122...,1840034030,1


In [31]:
jeopardy.groupby(['winner_flg'])['winner_flg'].agg(['count']).reset_index()

Unnamed: 0,winner_flg,count
0,0,1897
1,1,1253


In [32]:
jeopardy.columns

Index(['nickname', 'player_details', 'final_score', 'show_info', 'full_name',
       'occupation', 'hometown', 'archive_info', 'date', 'answer_number',
       'dt_indx', 'id_x', 'created_at', 'favorite_count', 'retweet_count',
       'text', 'game_info_flg', 'anecdote', 'gender', 'clean_text', 'city',
       'state', 'gross_winnings', 'city_ascii', 'state_id', 'state_name',
       'county_fips', 'county_name', 'county_fips_all', 'county_name_all',
       'lat', 'lng', 'population', 'density', 'source', 'military',
       'incorporated', 'timezone', 'ranking', 'zips', 'id_y', 'winner_flg'],
      dtype='object')

In [33]:
jeopardy.head(5)

Unnamed: 0,nickname,player_details,final_score,show_info,full_name,occupation,hometown,archive_info,date,answer_number,...,population,density,source,military,incorporated,timezone,ranking,zips,id_y,winner_flg
0,Jason,"Jason Zuffranieri, a math teacher from Albuque...",27600,"J! Archive - Show #8045, aired 2019-07-26",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8045,2019-07-26,answer3,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
1,Jason,"Jason Zuffranieri, a math teacher from Albuque...",4400,"J! Archive - Show #8044, aired 2019-07-25",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8044,2019-07-25,answer3,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
2,Jason,"Jason Zuffranieri, a math teacher from Albuque...",30000,"J! Archive - Show #8043, aired 2019-07-24",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8043,2019-07-24,answer3,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
3,Jason,"Jason Zuffranieri, a math teacher from Albuque...",30100,"J! Archive - Show #8042, aired 2019-07-23",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8042,2019-07-23,answer3,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1
4,Jason,"Jason Zuffranieri, a math teacher from Albuque...",18600,"J! Archive - Show #8041, aired 2019-07-22",Jason Zuffranieri,a math teacher,"Albuquerque, New Mexico",J! Archive - Show #8041,2019-07-22,answer3,...,758523.0,1151,polygon,False,True,America/Denver,2,87121 87120 87123 87112 87113 87110 87111 8711...,1840019176,1


In [34]:
jeopardy_df = jeopardy[['full_name', 'final_score', 'occupation', 'hometown', 'city',
       'state','archive_info', 'date','dt_indx', 'id_x', 'created_at','favorite_count', 'winner_flg',
                        'retweet_count', 'anecdote','text', 'game_info_flg', 'gender', 
                        'clean_text','answer_number']]

### Map State Abbr to data 

In [35]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Palau': 'PW',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

jeopardy_df['state_id'] = jeopardy_df['state'].map(us_state_abbrev)
jeopardy_df['state_id']
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0       NM
1       NM
2       NM
3       NM
4       NM
        ..
3145    ME
3146    OH
3147    MO
3148    NM
3149    FL
Name: state_id, Length: 3150, dtype: object

### Process Occupation Categories

In [36]:
jeopardy_df.occupation = jeopardy_df.occupation.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [37]:
jeopardy_df['job'] = jeopardy_df['occupation'].apply(lambda x: " ".join(x.lower() for x in x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [38]:
# stop words
stop = stopwords.words('english')
jeopardy_df['job'] = jeopardy_df['job'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
jeopardy_df['job_category'] = jeopardy_df['job']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [40]:
# Perform initial job categorization 
jeopardy_df.loc[jeopardy_df['job'].str.contains('professor',regex=False), 'job_category'] = 'professor'
jeopardy_df.loc[jeopardy_df['job'].str.contains('teacher',regex=False), 'job_category'] = 'teacher'
jeopardy_df.loc[jeopardy_df['job'].str.contains('educator',regex=False), 'job_category'] = 'teacher'
jeopardy_df.loc[jeopardy_df['job'].str.contains('instructor',regex=False), 'job_category'] = 'teacher'
jeopardy_df.loc[jeopardy_df['job'].str.contains('lecturer',regex=False), 'job_category'] = 'teacher'

#law 
jeopardy_df.loc[jeopardy_df['job'].str.contains('lawyer',regex=False), 'job_category'] = 'lawyer'
jeopardy_df.loc[jeopardy_df['job'].str.contains('attorney',regex=False), 'job_category'] = 'attorney'
jeopardy_df.loc[jeopardy_df['job'].str.contains('judge',regex=False), 'job_category'] = 'lawyer'

#general
jeopardy_df.loc[jeopardy_df['job'].str.contains('consultant',regex=False), 'job_category'] = 'consultant'
jeopardy_df.loc[jeopardy_df['job'].str.contains('writer',regex=False), 'job_category'] = 'writer'
jeopardy_df.loc[jeopardy_df['job'].str.contains('analyst',regex=False), 'job_category'] = 'analyst'
jeopardy_df.loc[jeopardy_df['job'].str.contains('bartender',regex=False), 'job_category'] = 'bartender'
jeopardy_df.loc[jeopardy_df['job'].str.contains('engineer',regex=False), 'job_category'] = 'engineer'
jeopardy_df.loc[jeopardy_df['job'].str.contains('architect',regex=False), 'job_category'] = 'architect'
jeopardy_df.loc[jeopardy_df['job'].str.contains('journalist',regex=False), 'job_category'] = 'journalist'

#library
jeopardy_df.loc[jeopardy_df['job'].str.contains('librarian',regex=False), 'job_category'] = 'librarian'
jeopardy_df.loc[jeopardy_df['job'].str.contains('library',regex=False), 'job_category'] = 'librarian'

# 
jeopardy_df.loc[jeopardy_df['job'].str.contains('nurse',regex=False), 'job_category'] = 'nurse'
jeopardy_df.loc[jeopardy_df['job'].str.contains('author',regex=False), 'job_category'] = 'author'

# college jobs
jeopardy_df.loc[jeopardy_df['job'].str.contains('scientist',regex=False), 'job_category'] = 'scientist'
jeopardy_df.loc[jeopardy_df['job'].str.contains('research',regex=False), 'job_category'] = 'scientist'

# technician
jeopardy_df.loc[jeopardy_df['job'].str.contains('technician',regex=False), 'job_category'] = 'technician'
jeopardy_df.loc[jeopardy_df['job'].str.contains('lab tech',regex=False), 'job_category'] = 'technician'

# chef
jeopardy_df.loc[jeopardy_df['job'].str.contains('chef',regex=False), 'job_category'] = 'chef'

# student categories 
jeopardy_df.loc[jeopardy_df['job'].str.contains('freshman',regex=False), 'job_category'] = 'student'
jeopardy_df.loc[jeopardy_df['job'].str.contains('sophomore',regex=False), 'job_category'] = 'student'
jeopardy_df.loc[jeopardy_df['job'].str.contains('junior',regex=False), 'job_category'] = 'student'
jeopardy_df.loc[jeopardy_df['job'].str.contains('senior',regex=False), 'job_category'] = 'student'
jeopardy_df.loc[jeopardy_df['job'].str.contains('student',regex=False), 'job_category'] = 'student'
# grad students
jeopardy_df.loc[jeopardy_df['job'].str.contains('grad',regex=False), 'job_category'] = 'grad student'
jeopardy_df.loc[jeopardy_df['job'].str.contains('ph.d.',regex=False), 'job_category'] = 'grad student'
jeopardy_df.loc[jeopardy_df['job'].str.contains('ph.d',regex=False), 'job_category'] = 'grad student'

# parent / family 
jeopardy_df.loc[jeopardy_df['job'].str.contains('mom',regex=False), 'job_category'] = 'mom'

# advisor
jeopardy_df.loc[jeopardy_df['job'].str.contains('counselor',regex=False), 'job_category'] = 'advisor'
jeopardy_df.loc[jeopardy_df['job'].str.contains('advisor',regex=False), 'job_category'] = 'advisor'

# manager
jeopardy_df.loc[jeopardy_df['job'].str.contains('manager',regex=False), 'job_category'] = 'manager'
jeopardy_df.loc[jeopardy_df['job'].str.contains('executive',regex=False), 'job_category'] = 'manager'
jeopardy_df.loc[jeopardy_df['job'].str.contains('coordinator',regex=False), 'job_category'] = 'coordinator'
jeopardy_df.loc[jeopardy_df['job'].str.contains('director',regex=False), 'job_category'] = 'director'

# art 
jeopardy_df.loc[jeopardy_df['job'].str.contains('designer',regex=False), 'job_category'] = 'artist/designer'
jeopardy_df.loc[jeopardy_df['job'].str.contains('art',regex=False), 'job_category'] = 'artist/designer'

#specialist
jeopardy_df.loc[jeopardy_df['job'].str.contains('specialist',regex=False), 'job_category'] = 'specialist'

# volunteer
jeopardy_df.loc[jeopardy_df['job'].str.contains('volunteer',regex=False), 'job_category'] = 'volunteer'

# reporter
jeopardy_df.loc[jeopardy_df['job'].str.contains('newspaper',regex=False), 'job_category'] = 'reporter'
jeopardy_df.loc[jeopardy_df['job'].str.contains('reporter',regex=False), 'job_category'] = 'reporter'

# military officer
jeopardy_df.loc[jeopardy_df['job'].str.contains('naval officer',regex=False), 'job_category'] = 'military officer'
jeopardy_df.loc[jeopardy_df['job'].str.contains('u.s. air force',regex=False), 'job_category'] = 'military officer'
jeopardy_df.loc[jeopardy_df['job'].str.contains('u.s. navy',regex=False), 'job_category'] = 'military officer'
jeopardy_df.loc[jeopardy_df['job'].str.contains('army',regex=False), 'job_category'] = 'military officer'
jeopardy_df.loc[jeopardy_df['job'].str.contains('navy',regex=False), 'job_category'] = 'military officer'
jeopardy_df.loc[jeopardy_df['job'].str.contains('coast guard',regex=False), 'job_category'] = 'military officer'

# non profit
jeopardy_df.loc[jeopardy_df['job'].str.contains('nonprofit',regex=False), 'job_category'] = 'nonprofit'

# musicians
jeopardy_df.loc[jeopardy_df['job'].str.contains('music',regex=False), 'job_category'] = 'musician'

# doctors 
jeopardy_df.loc[jeopardy_df['job'].str.contains('doctor',regex=False), 'job_category'] = 'doctor'

# entrepreneur
jeopardy_df.loc[jeopardy_df['job'].str.contains('entrepreneur',regex=False), 'job_category'] = 'entrepreneur'

# Misc
jeopardy_df.loc[jeopardy_df['job'].str.contains('high school',regex=False), 'job_category'] = 'misc education'
jeopardy_df.loc[jeopardy_df['job'].str.contains('health',regex=False), 'job_category'] = 'misc health'
jeopardy_df.loc[jeopardy_df['job'].str.contains('government',regex=False), 'job_category'] = 'misc government'
jeopardy_df.loc[jeopardy_df['job'].str.contains('tax',regex=False), 'job_category'] = 'misc government'

# unemployed 
jeopardy_df.loc[jeopardy_df['job'].str.contains('retired',regex=False), 'job_category'] = 'unemployed'
jeopardy_df.loc[jeopardy_df['job'].str.contains('originally',regex=False), 'job_category'] = 'unemployed'
jeopardy_df.loc[jeopardy_df['job'].str.contains('stay-at-home',regex=False), 'job_category'] = 'unemployed'
jeopardy_df.loc[jeopardy_df['job'].str.contains('year-old',regex=False), 'job_category'] = 'unemployed'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


#### EXPORT AND FINISH WITH MANUAL REVIEW

In [41]:
job_check = jeopardy_df.groupby(["job", "job_category"]).size().reset_index(name="count")
job_check = job_check.sort_values('count')
job_check
#job_check.to_csv('../data/job_check.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path

Unnamed: 0,job,job_category,count
0,10th 11th grade english teacher,grad student,1
763,medical transcriptionist,medical transcriptionist,1
764,medicare appeals coordinator,coordinator,1
765,member services counselor,advisor,1
773,middle school humanities teacher,teacher,1
...,...,...,...
1162,stay-at-home mom,unemployed,32
692,librarian,librarian,33
1095,senior,student,34
1300,writer,writer,42


In [42]:
# Load manual job categories
job_categories = pd.read_csv("../data/processed/job_manual_check.csv")
job_categories.columns

Index(['job', 'job_category', 'job_category_manual', 'count',
       'detailed review', 'updated_category', 'interesting_jobs',
       'final_job_category'],
      dtype='object')

In [43]:
job_categories = job_categories[['job','final_job_category']]

In [44]:
# Merge category onto dataset 
jeopardy_df = pd.merge(jeopardy_df,job_categories,on=['job'])
jeopardy_df.head(5)

Unnamed: 0,full_name,final_score,occupation,hometown,city,state,archive_info,date,dt_indx,id_x,...,anecdote,text,game_info_flg,gender,clean_text,answer_number,state_id,job,job_category,final_job_category
0,Jason Zuffranieri,27600,a math teacher,"Albuquerque, New Mexico",Albuquerque,New Mexico,J! Archive - Show #8045,2019-07-26,1,1154934362737205253,...,A movie editor in France has a similar name to...,"7/26/19:\n""My medical school interview was wit...",True,male,A movie editor in France has a similar name to...,answer3,NM,math teacher,teacher,teacher
1,Jason Zuffranieri,4400,a math teacher,"Albuquerque, New Mexico",Albuquerque,New Mexico,J! Archive - Show #8044,2019-07-25,1,1154567652465106944,...,It took me seconds to 'win' a game of anti-chess.,"7/25/19:\n""I like cooking with my 2-year-old s...",True,male,It took me seconds to win a game of anti-chess.,answer3,NM,math teacher,teacher,teacher
2,Jason Zuffranieri,30000,a math teacher,"Albuquerque, New Mexico",Albuquerque,New Mexico,J! Archive - Show #8043,2019-07-24,1,1154498659637051393,...,"“In Mexico, I was mistaken for Nicolas Cage.”",7/24/19:\n“I played drums in a rhythmic troupe...,True,male,"“In Mexico, I was mistaken for Nicolas Cage.”",answer3,NM,math teacher,teacher,teacher
3,Jason Zuffranieri,30100,a math teacher,"Albuquerque, New Mexico",Albuquerque,New Mexico,J! Archive - Show #8042,2019-07-23,1,1154153044772306945,...,My stuffed manatee is a comfort animal in my c...,"7/23/19:\n""I like telling dad jokes.""\n""I run ...",True,male,My stuffed manatee is a comfort animal in my c...,answer3,NM,math teacher,teacher,teacher
4,Jason Zuffranieri,18600,a math teacher,"Albuquerque, New Mexico",Albuquerque,New Mexico,J! Archive - Show #8041,2019-07-22,1,1154141731669073920,...,I travel around the world playing Sudoku.,"7/22/19:\n""I explain forensic science to kids ...",True,male,I travel around the world playing Sudoku.,answer3,NM,math teacher,teacher,teacher


In [45]:
jeopardy_df.columns

Index(['full_name', 'final_score', 'occupation', 'hometown', 'city', 'state',
       'archive_info', 'date', 'dt_indx', 'id_x', 'created_at',
       'favorite_count', 'winner_flg', 'retweet_count', 'anecdote', 'text',
       'game_info_flg', 'gender', 'clean_text', 'answer_number', 'state_id',
       'job', 'job_category', 'final_job_category'],
      dtype='object')

In [46]:
jeopardy_df = jeopardy_df[['full_name', 'final_score', 'occupation', 'hometown', 'city', 'state', 'state_id' ,
       'archive_info', 'date', 'dt_indx', 'id_x', 'created_at',
       'favorite_count', 'retweet_count', 'anecdote', 'text', 'game_info_flg','winner_flg',
       'gender', 'clean_text', 'answer_number', 'job', 'final_job_category']]

In [47]:
jeopardy_df.to_csv('../data/processed/clean_jeopardy_data.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path
jeopardy_df['clean_text'].to_csv('../data/processed/tweet_stories.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path