## 020_gen_stats
### General Statistics and Graphics 
### James Wilson

In [104]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import plotly.express as plt # You can also use matplotlib
import plotly.graph_objects as go
pd.options.mode.chained_assignment = None  # default='warn'


In [105]:
# Load Data
jeopardy_df = pd.read_csv("../data/processed/clean_jeopardy_data.csv")
jeopardy_df.columns

Index(['full_name', 'final_score', 'occupation', 'hometown', 'city', 'state',
       'state_id', 'archive_info', 'date', 'dt_indx', 'id_x', 'created_at',
       'favorite_count', 'retweet_count', 'anecdote', 'text', 'game_info_flg',
       'winner_flg', 'gender', 'clean_text', 'answer_number', 'job',
       'final_job_category'],
      dtype='object')

In [106]:
jeopardy_df.head()

Unnamed: 0,full_name,final_score,occupation,hometown,city,state,state_id,archive_info,date,dt_indx,...,retweet_count,anecdote,text,game_info_flg,winner_flg,gender,clean_text,answer_number,job,final_job_category
0,Jason Zuffranieri,27600,a math teacher,"Albuquerque, New Mexico",Albuquerque,New Mexico,NM,J! Archive - Show #8045,2019-07-26,1,...,2,A movie editor in France has a similar name to...,"7/26/19:\n""My medical school interview was wit...",True,1,male,A movie editor in France has a similar name to...,answer3,math teacher,teacher
1,Jason Zuffranieri,4400,a math teacher,"Albuquerque, New Mexico",Albuquerque,New Mexico,NM,J! Archive - Show #8044,2019-07-25,1,...,1,It took me seconds to 'win' a game of anti-chess.,"7/25/19:\n""I like cooking with my 2-year-old s...",True,1,male,It took me seconds to win a game of anti-chess.,answer3,math teacher,teacher
2,Jason Zuffranieri,30000,a math teacher,"Albuquerque, New Mexico",Albuquerque,New Mexico,NM,J! Archive - Show #8043,2019-07-24,1,...,3,"“In Mexico, I was mistaken for Nicolas Cage.”",7/24/19:\n“I played drums in a rhythmic troupe...,True,1,male,"“In Mexico, I was mistaken for Nicolas Cage.”",answer3,math teacher,teacher
3,Jason Zuffranieri,30100,a math teacher,"Albuquerque, New Mexico",Albuquerque,New Mexico,NM,J! Archive - Show #8042,2019-07-23,1,...,0,My stuffed manatee is a comfort animal in my c...,"7/23/19:\n""I like telling dad jokes.""\n""I run ...",True,1,male,My stuffed manatee is a comfort animal in my c...,answer3,math teacher,teacher
4,Jason Zuffranieri,18600,a math teacher,"Albuquerque, New Mexico",Albuquerque,New Mexico,NM,J! Archive - Show #8041,2019-07-22,1,...,0,I travel around the world playing Sudoku.,"7/22/19:\n""I explain forensic science to kids ...",True,1,male,I travel around the world playing Sudoku.,answer3,math teacher,teacher


In [107]:
jeopardy_df.groupby('full_name')['full_name'].transform('count')

0       6
1       6
2       6
3       6
4       6
       ..
3142    1
3143    1
3144    1
3145    1
3146    1
Name: full_name, Length: 3147, dtype: int64

### General Analysis

In [108]:
contestants = jeopardy_df.groupby('full_name')['full_name'].agg(['count']).reset_index()
contestants.shape #1,971
contestants.sort_values(by=['count'], ascending=False,inplace=True)
contestants

Unnamed: 0,full_name,count
784,James Holzhauer,31
979,Julia Collins,25
186,Austin Rogers,18
1730,Seth Wilson,13
297,Buzzy Cohen,13
...,...,...
996,Julie Stapel,1
995,Julie Sesnovich,1
994,Julie Roth,1
993,Julie Hornick,1


In [109]:
jeopardy_df['games_played'] = jeopardy_df.groupby('full_name')['full_name'].transform('count')
jeopardy_df['gross_winnings'] = jeopardy_df.groupby('full_name')['final_score'].transform('sum')

In [110]:
import plotly.express as px

sub = jeopardy_df[jeopardy_df['games_played'] < 10]
#sub = jeopardy_df[jeopardy_df['gross_winnings'] < 100000]


In [111]:
fig = px.scatter(sub, x="games_played", y="gross_winnings")
fig.show()

In [112]:
fig = px.box(sub, x="games_played", y="gross_winnings", points="all")

fig.update_layout(
    title_text='Gross Winnings by Number of Games Played'
)

fig.show()

In [113]:
#score_count = jeopardy_df.groupby('job_category')['Final Score'].agg(['sum']).reset_index()
#score_count = score_count.sort_values('sum')
#score_count

In [114]:
jeopardy_df['date'].min() # '2012-10-24'
jeopardy_df['date'].max() # '2019-07-26'

'2019-07-26'

In [115]:
town = jeopardy_df.groupby('hometown')['full_name'].agg(['count']).reset_index()
town.sort_values(by=['count'], ascending=False,inplace=True)
#town.to_csv('../data/state_counts.csv', index = None, header=True) 
town[town.loc[:,'count']==1].shape[0]/town.shape[0]


0.5110081112398609

In [116]:
town2 = jeopardy_df.groupby(['hometown','full_name'])['hometown'].agg(['count']).reset_index()
town2 = town2.groupby(['hometown'])['full_name'].agg(['count']).reset_index()
town2.sort_values(by=['count'], ascending=False,inplace=True)
town2[town2.loc[:,'count']==1].shape[0]/town2.shape[0]
# 71% of contestants are unique to their hometown 

0.7056778679026651

### State Counts

In [117]:
state_counts = jeopardy_df.groupby('state')['full_name'].agg(['count']).reset_index()
state_counts.sort_values(by=['count'], ascending=False,inplace=True)
state_counts.head(10)

Unnamed: 0,state,count
4,California,381
31,New York,316
12,Illinois,217
42,Texas,154
37,Pennsylvania,150
45,Virginia,134
34,Ohio,129
9,Georgia,122
19,Maryland,119
8,Florida,112


In [118]:
state_counts2 = jeopardy_df.groupby(['state_id','full_name'])['state_id'].agg(['count']).reset_index()
state_counts2 = state_counts2.groupby(['state_id'])['full_name'].agg(['count']).reset_index()
state_counts2.sort_values(by=['count'], ascending=False,inplace=True)
state_counts2.head(10)

Unnamed: 0,state_id,count
4,CA,268
33,NY,200
13,IL,125
42,TX,109
44,VA,92
37,PA,92
19,MD,79
8,FL,72
9,GA,72
34,OH,63


In [119]:
fig = px.bar(state_counts2, x='state_id', y='count',color='count',
             labels={'state':'State', 'count':"Count"}, height=400, title="Contestant Count per State")
fig.show()

In [120]:
fig = go.Figure(data=go.Choropleth(
    locations=state_counts2['state_id'], # Spatial coordinates
    z = state_counts2['count'].astype(float), # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    colorscale = 'Reds',
    colorbar_title = "Count",
))

fig.update_layout(
    title_text = 'Number of Total Unique Contestants per State',
    geo_scope='usa', # limited map scope to USA
)

fig.show()

In [121]:
# State Winnings 
#state_rev_counts = jeopardy_df.groupby('state')['final_score'].agg(['sum']).reset_index()
#state_rev_counts.sort_values(by=['sum'], ascending=False,inplace=True)
#state_rev_counts.head(10)

#### Gender statistics 

In [122]:
jeopardy_df.gender.value_counts()

male             1354
female           1169
unknown           231
mostly_male       184
mostly_female     176
andy               33
Name: gender, dtype: int64

In [123]:
jeopardy_df['gender_clean'] = jeopardy_df['gender']

jeopardy_df.loc[jeopardy_df["gender"]=='mostly_male',"gender_clean"] = "male"
jeopardy_df.loc[jeopardy_df["gender"]=='mostly_female',"gender_clean"] = "female"
jeopardy_df.loc[jeopardy_df["gender"]=='andy',"gender_clean"] = "unknown"

gender_tbl = pd.DataFrame(jeopardy_df.gender_clean.value_counts()).reset_index()
gender_tbl

Unnamed: 0,index,gender_clean
0,male,1538
1,female,1345
2,unknown,264


In [124]:
#jeopardy_df.groupby(['gender_clean'])['winner_flg'].agg(['sum'])

In [125]:
fig_jobwin = px.bar(gender_tbl, x='index', y='gender_clean', color='index',
             labels={'index':'Gender', 'gender_clean':"Count"}, height=400, text='gender_clean',
    color_discrete_map={
        'male': '#F08B52',
        'female': '#A074D9',
        'unknown': '#EFDD01'
    },title="Contestant Gender Distribution")
fig_jobwin.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig_jobwin.update_yaxes(tick0=0, dtick=250)
fig_jobwin.update_layout(showlegend=False)
fig_jobwin.update_xaxes(tickangle=45)
fig_jobwin.update_layout(plot_bgcolor='white')
fig_jobwin.show()

In [126]:
from scipy.stats import ttest_ind

cat1 = jeopardy_df[jeopardy_df['gender_clean']=='male']
cat1 = cat1.groupby(['full_name'])['winner_flg'].agg(['sum']).reset_index()
cat1['winner_flg'] = np.where(cat1['sum'] > 0, 1, 0)
cat1['winner_flg'].agg(['sum'])

cat2 = jeopardy_df[jeopardy_df['gender_clean']=='female']
cat2 = cat2.groupby(['full_name'])['winner_flg'].agg(['sum']).reset_index()
cat2['winner_flg'] = np.where(cat2['sum'] > 0, 1, 0)
cat2['winner_flg'].agg(['sum'])

#ttest_ind(cat1['winner_flg'], cat2['winner_flg'])


sum    260
Name: winner_flg, dtype: int64

## Job Categories

In [127]:
jeopardy_df['job'].nunique()

1317

In [128]:
jeopardy_df['final_job_category'].nunique()

33

In [129]:
job_count = jeopardy_df.groupby('final_job_category')['full_name'].agg(['count']).reset_index()
job_count = job_count.sort_values('count')
job_count.head(5)

Unnamed: 0,final_job_category,count
11,laborer,16
15,marketing,17
6,coordinator,20
17,misc education,24
25,salesperson,24


In [130]:
# attach job category counts
#job_scores = jeopardy_df.groupby(['final_job_category'])['full_name'].agg(['sum']).reset_index()
#job_scores.sort_values(by=['sum'], ascending=False, inplace=True)
#job_scores.head(20)

In [131]:
unique_jobs = jeopardy_df.groupby('final_job_category')['final_job_category'].agg(['count']).reset_index()
unique_jobs.head(5)

Unnamed: 0,final_job_category,count
0,accounting/ business/ finance,88
1,actor/artist/designer,97
2,administration/ advisory,75
3,analyst,88
4,assistant/ service industry,91


In [132]:
avg_job_scores = jeopardy_df.groupby(['final_job_category'])['final_score'].agg(['median']).reset_index()
avg_job_scores.sort_values(by=['median'], ascending=False, inplace=True)
avg_job_scores.head(5)

Unnamed: 0,final_job_category,median
0,accounting/ business/ finance,16850.0
21,miscellaneous,16200.0
22,musician,14700.0
4,assistant/ service industry,13200.0
24,professor,13199.5


In [133]:
jeopardy_df['games_played'] = jeopardy_df.groupby('full_name')['full_name'].transform('count')
avg_job_scores = jeopardy_df.groupby(['final_job_category','games_played'])['final_score'].agg(['mean']).reset_index()
avg_job_scores.sort_values(by=['games_played','mean'], ascending=False, inplace=True)
avg_job_scores.head(5)

Unnamed: 0,final_job_category,games_played,mean
105,miscellaneous,31,75608.032258
6,accounting/ business/ finance,25,20385.714286
104,miscellaneous,25,9400.0
30,assistant/ service industry,18,27547.222222
58,grad student,13,20262.0


In [134]:
## WHAT JOB CATEGORY SEEMS TO APPEAR ON JEOPARDY OFTEN (OUTSIDE TEACHERS / STUDENTS)

In [135]:
full_names_and_jobs = jeopardy_df.groupby(['final_job_category','full_name'])['final_job_category'].agg(['count']).reset_index()
people_job_count = full_names_and_jobs.groupby(['final_job_category'])['final_job_category'].agg(['count']).reset_index()
people_job_count.sort_values(by=['count'], ascending=False, inplace=True)
people_job_count.head(10)

Unnamed: 0,final_job_category,count
31,unemployed,196
29,teacher,180
28,student,145
14,manager,138
12,lawyer,134
32,writer,133
10,grad student,98
1,actor/artist/designer,62
26,scientist/ researcher,56
7,director,55


In [136]:
## WHAT JOB CATEGORY DOES BEST IN TERMS OF WINNING ? 

In [137]:
winner_jobs = jeopardy_df.groupby(['final_job_category','full_name'])['winner_flg'].agg(['sum']).reset_index()
winner_jobs['winner_flg'] = np.where(winner_jobs['sum'] > 0, 1, 0)
winner_jobs = winner_jobs.groupby(['final_job_category'])['winner_flg'].agg(['sum']).reset_index()
winner_jobs.sort_values(by=['sum'], ascending=False, inplace=True)
winner_jobs.head(15)

Unnamed: 0,final_job_category,sum
29,teacher,64
31,unemployed,62
28,student,57
32,writer,45
12,lawyer,45
10,grad student,34
14,manager,32
24,professor,21
1,actor/artist/designer,20
2,administration/ advisory,18


In [138]:
# Versus overall # that come onto SHOW 

In [139]:
jobwin = pd.merge(people_job_count,winner_jobs, on='final_job_category')
jobwin['win_ratio'] = jobwin['sum']/jobwin['count']
jobwin.sort_values(by=['win_ratio'], ascending=True, inplace=True)
jobwin.head(10)

Unnamed: 0,final_job_category,count,sum,win_ratio
29,coordinator,17,3,0.176471
21,physician,38,8,0.210526
13,specialist,52,11,0.211538
3,manager,138,32,0.231884
12,technician/ programmer,53,14,0.264151
16,engineer/ architect,48,13,0.270833
9,director,55,15,0.272727
27,misc education,18,5,0.277778
26,salesperson,18,5,0.277778
8,scientist/ researcher,56,17,0.303571


In [140]:
fig_jobwin = px.bar(jobwin, x='final_job_category', y='win_ratio',color='win_ratio',
             labels={'final_job_category':'Job Category', 'win_ratio':"Win Ratio"}, height=400, 
                    title="Win Ratio per Job Category")
fig_jobwin.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig_jobwin.update_yaxes(tick0=0, dtick=0.1)
fig_jobwin.update_xaxes(tickangle=45)
fig_jobwin.update_layout(plot_bgcolor='white')
fig_jobwin.show()

#### Winners

In [141]:
#jeopardy_df.iloc[np.where(jeopardy_df['full_name']== 'Julia Collins')]

In [142]:
#jeopardy_df.iloc[np.where(jeopardy_df['full_name']== 'Chip Brookes')]

### Graphic of State Contestant Count vs winner_flg 

In [143]:
#stateid_counts = pd.DataFrame(jeopardy_df.groupby(['state_id','full_name'])['state_id','winner_flg'].agg(['sum']).reset_index())
#stateid_counts.columns = [
#'_'.join(col).rstrip('_') for col in stateid_counts.columns.values
#]
#stateid_counts.to_csv('../data/output/sum_tab1.csv', index = None, header=True) 

stateid_counts = pd.DataFrame(jeopardy_df.groupby(['state_id','full_name'])['winner_flg'].agg(['sum']).reset_index())
stateid_counts['winner_flg_bin'] = np.where(stateid_counts['sum'] > 0, 1, 0)
stateid_counts

Unnamed: 0,state_id,full_name,sum,winner_flg_bin
0,AK,Chip Brookes,0,0
1,AK,Oliver Bundy,0,0
2,AK,Sarah Sinclair,0,0
3,AK,Thea Lawton,1,1
4,AL,Allan Ashley,0,0
...,...,...,...,...
1971,WV,Tyler Morrison,0,0
1972,WY,Dave Rowswell,0,0
1973,WY,Justin Earnshaw,0,0
1974,WY,Pat Greiner,1,1


In [144]:
stateid_counts.groupby(['winner_flg_bin'])['winner_flg_bin'].agg(['count']).reset_index()

Unnamed: 0,winner_flg_bin,count
0,0,1341
1,1,635


In [145]:
tbl1 = stateid_counts.groupby(['state_id'])['winner_flg_bin'].agg(['sum']).reset_index()
tbl2 = stateid_counts.groupby(['state_id'])['full_name'].agg(['count']).reset_index()

tbl1.columns = ['state_id','winner_cnt']
tbl2.columns = ['state_id','player_cnt']

state_winner_stats = pd.merge(tbl1,tbl2,on=['state_id'])
state_winner_stats['winner_ratio'] = state_winner_stats['winner_cnt']/state_winner_stats['player_cnt']
state_winner_stats.head(5)

Unnamed: 0,state_id,winner_cnt,player_cnt,winner_ratio
0,AK,1,4,0.25
1,AL,9,21,0.428571
2,AR,8,14,0.571429
3,AZ,7,25,0.28
4,CA,76,268,0.283582


In [146]:
fig = go.Figure(data=go.Choropleth(
    locations=state_winner_stats['state_id'], # Spatial coordinates
    z = state_winner_stats['winner_ratio'].astype(float), # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    colorscale = 'Blues',
    colorbar_title = "Winner Ratio",
))

fig.update_layout(
    title_text = 'Proportion of Winners out of Total Contestants per State',
    geo_scope='usa', # limited map scope to USA
)

fig.show()

#### TEXT Analysis 

In [147]:
from textblob import TextBlob
def senti_p(x):
    return TextBlob(x).sentiment.polarity  
def senti_s(x):
    return TextBlob(x).sentiment.subjectivity  

jeopardy_df['polarity_score'] = jeopardy_df['clean_text'].apply(senti_p)
jeopardy_df['subjectivity_score'] = jeopardy_df['clean_text'].apply(senti_s)
#Polarity is a float that lies between [-1,1], -1 indicates negative sentiment and +1 indicates positive sentiments. 
#Subjectivity is also a float which lies in the range of [0,1]. Subjective sentences generally refer to personal opinion, 
#emotion, or judgment. 

In [148]:
jeopardy_df['polarity_score']
pol_means = jeopardy_df.groupby(['full_name'])['polarity_score'].agg(['mean']).reset_index()
pol_means.columns = ['full_name','polarity_avg']
pol_means

Unnamed: 0,full_name,polarity_avg
0,Aaron Benor,0.000000
1,Aaron Ellis,0.000000
2,Aaron Lichtig,0.250000
3,Abby Roughton,0.000000
4,Abigail Myers,0.045455
...,...,...
1963,Zelda Pulliam,0.000000
1964,Zia Jones,0.000000
1965,Ziad Ali,0.000000
1966,Zlatan Hodzic,0.000000


In [149]:
jeopardy_df['subjectivity_score']
sub_means = jeopardy_df.groupby(['full_name'])['subjectivity_score'].agg(['mean']).reset_index()
sub_means.columns = ['full_name','subjectivity_avg']
sub_means

Unnamed: 0,full_name,subjectivity_avg
0,Aaron Benor,0.000000
1,Aaron Ellis,0.000000
2,Aaron Lichtig,0.250000
3,Abby Roughton,0.000000
4,Abigail Myers,0.151515
...,...,...
1963,Zelda Pulliam,0.000000
1964,Zia Jones,0.000000
1965,Ziad Ali,0.050000
1966,Zlatan Hodzic,0.000000


In [150]:
jeopardy_df = pd.merge(jeopardy_df, pol_means, on='full_name')
jeopardy_df = pd.merge(jeopardy_df, sub_means, on='full_name')
jeopardy_df.columns

Index(['full_name', 'final_score', 'occupation', 'hometown', 'city', 'state',
       'state_id', 'archive_info', 'date', 'dt_indx', 'id_x', 'created_at',
       'favorite_count', 'retweet_count', 'anecdote', 'text', 'game_info_flg',
       'winner_flg', 'gender', 'clean_text', 'answer_number', 'job',
       'final_job_category', 'games_played', 'gross_winnings', 'gender_clean',
       'polarity_score', 'subjectivity_score', 'polarity_avg',
       'subjectivity_avg'],
      dtype='object')

In [151]:
jeopardy_df['polarity_avg'].agg(['max','mean','median','min']) 

max       1.000000
mean      0.041945
median    0.000000
min      -1.000000
Name: polarity_avg, dtype: float64

In [152]:
jeopardy_df['subjectivity_avg'].agg(['max','mean','median','min']) 

max       1.000000
mean      0.151361
median    0.050000
min       0.000000
Name: subjectivity_avg, dtype: float64

### Regression model 

In [165]:
#import statsmodels.api as sm
#from statsmodels.formula.api import ols
#from statsmodels.sandbox.regression.predstd import wls_prediction_std
#from sklearn import linear_model

#reg_model_dta = jeopardy_df[['final_score','gender_cat','city_cat', 'state_cat', 'job_cat','polarity_score','subjectivity_score']]
#reg_model_dta.drop_duplicates(inplace=True) #inplace=True
#reg_model_dta.head(5)

### ATTEMPT LOGISTIC MODEL (WIN/ DON'T WIN)

In [166]:
# Assume independence of games (i.e. winner doesn't have any benefit in next game)

In [167]:
# create a flag for if a player one at least one game 
win_tab = jeopardy_df.groupby(['full_name'])['winner_flg'].agg(['sum']).reset_index()
win_tab['wins_atleast_once'] = np.where(win_tab['sum']>0, 1, 0)
jeopardy_df = pd.merge(jeopardy_df,win_tab[['full_name','wins_atleast_once']], on= 'full_name')

In [193]:
# prepare variables 
# try gender_clean 
jeopardy_df['gender_clean'] = jeopardy_df['gender_clean'].astype('category')
jeopardy_df['gender_cat'] = jeopardy_df['gender_clean'].cat.codes

jeopardy_df['state'] = jeopardy_df['state'].astype('category')
jeopardy_df['state_cat'] = jeopardy_df['state'].cat.codes

jeopardy_df['city'] = jeopardy_df['city'].astype('category')
jeopardy_df['city_cat'] = jeopardy_df['city'].cat.codes

jeopardy_df['final_job_category'] = jeopardy_df['final_job_category'].astype('category')
jeopardy_df['job_cat'] = jeopardy_df['final_job_category'].cat.codes


In [194]:
# create subset of deduped data 
model_dta = jeopardy_df[['full_name','wins_atleast_once','gender_cat','city_cat', 'state_cat', 'job_cat','polarity_avg','subjectivity_avg']]
model_dta.drop_duplicates(inplace=True) #inplace=True
model_dta

Unnamed: 0,full_name,wins_atleast_once,gender_cat,city_cat,state_cat,job_cat,polarity_avg,subjectivity_avg
0,Jason Zuffranieri,1,1,4,30,29,0.166667,0.20
6,Kevin Paquette,0,1,124,45,29,0.000000,0.00
7,Rand Wise,1,0,172,9,29,0.000000,0.00
8,Julien Corven,0,1,545,19,29,0.000000,0.00
9,Quin Lewellen,1,2,4,30,1,0.160000,0.54
...,...,...,...,...,...,...,...,...
3142,Julie Hornick,0,0,713,39,13,0.000000,0.00
3143,Kirsten Albair,0,0,104,18,21,0.000000,0.00
3144,Dawn Volmert,0,0,735,24,21,0.000000,0.00
3145,Chris Sánchez,0,1,119,30,29,-0.400000,0.60


In [195]:
model_dta['wins_atleast_once'].agg(['sum'])

sum    650
Name: wins_atleast_once, dtype: int64

In [196]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sn
import matplotlib.pyplot as plt

In [197]:
X = model_dta[['gender_cat','city_cat', 'state_cat', 'job_cat','polarity_avg','subjectivity_avg']]
y = model_dta['wins_atleast_once']

In [198]:
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
columns = X_train.columns
os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of subscription",len(os_data_y[os_data_y['y']==1]))
print("Proportion of no subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))

length of oversampled data is  1992
Number of no subscription in oversampled data 996
Number of subscription 996
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [199]:
import statsmodels.api as sm
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.635686
         Iterations 5
                         Results: Logit
Model:              Logit             Pseudo R-squared: -0.005   
Dependent Variable: wins_atleast_once AIC:              2535.6749
Date:               2021-02-10 12:23  BIC:              2569.2352
No. Observations:   1985              Log-Likelihood:   -1261.8  
Df Model:           5                 LL-Null:          -1255.2  
Df Residuals:       1979              LLR p-value:      1.0000   
Converged:          1.0000            Scale:            1.0000   
No. Iterations:     5.0000                                       
-----------------------------------------------------------------
                   Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
-----------------------------------------------------------------
gender_cat         0.1202   0.0716  1.6786 0.0932 -0.0202  0.2606
city_cat          -0.0007   0.0002 -3.8653 0.0001 -0.0010 -0.0003


In [190]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [191]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.68


In [192]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[339   0]
 [158   0]]


In [178]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.68      1.00      0.81       404
          1       0.00      0.00      0.00       192

avg / total       0.46      0.68      0.55       596




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.



In [None]:
## Attempt Random Forest 

In [180]:
from sklearn.datasets import make_classification
# evaluate random forest algorithm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
# define dataset
#X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=3)
# define the model
model = RandomForestClassifier()
# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.708 (0.024)


<sklearn.model_selection._split.RepeatedStratifiedKFold at 0x1ceae96afc8>

### MISC PLots

In [None]:
# Graph State Winnings 
import plotly.graph_objects as go

state_rev_counts = pd.DataFrame(state_rev_counts)

fig = go.Figure(data=go.Choropleth(
    locations=state_rev_counts['state_id'], # Spatial coordinates
    z = state_rev_counts['sum'].astype(float), # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    colorscale = 'Reds',
    colorbar_title = "Millions USD",
))

fig.update_layout(
    title_text = 'Gross Winnings by State',
    geo_scope='usa', # limite map scope to USA
)

fig.show()