# SPEED DATING EXPERIMENT

### DS-GA-23 Final Project
#### Miranda Remmer


****

In [235]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model, feature_selection, neighbors, metrics, grid_search, cross_validation


%matplotlib inline
plt.style.use('ggplot')

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 20)

In [236]:
df_raw = pd.read_csv(os.path.join('..', 'CODE', 'speed-dating-experiment', 'Speed Dating Data.csv'))

df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(174), int64(13), object(8)
memory usage: 12.5+ MB


In [237]:
df_raw.columns

Index([u'iid', u'id', u'gender', u'idg', u'condtn', u'wave', u'round',
       u'position', u'positin1', u'order',
       ...
       u'attr3_3', u'sinc3_3', u'intel3_3', u'fun3_3', u'amb3_3', u'attr5_3',
       u'sinc5_3', u'intel5_3', u'fun5_3', u'amb5_3'],
      dtype='object', length=195)

In [238]:
df_raw

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,22.0,1,44,2,21,22,14,10.0,5,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8374,552,22.0,1,44,2,21,22,13,10.0,4,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8375,552,22.0,1,44,2,21,22,19,10.0,10,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0
8376,552,22.0,1,44,2,21,22,3,10.0,16,...,8.0,5.0,7.0,6.0,7.0,9.0,5.0,9.0,5.0,6.0


*******

## Creating New Dataset with Desired Columns


In [239]:
subset_df = df_raw[['iid', 'gender', 'round', 'wave','pid', 
                    'samerace', 'imprace', 'imprelig', 
                    'age', 'age_o', 
                    'match', 'dec', 'dec_o', 'condtn', 
                    'exphappy', 'expnum', 'goal', 'match_es', 
                    'like', 'prob', 'like_o', 'prob_o', 
                    'attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 
                    'attr_o', 'sinc_o', 'intel_o', 'fun_o','amb_o','shar_o', 
                    'attr3_1', 'sinc3_1', 'fun3_1', 'intel3_1', 'amb3_1', 
                    'attr3_2', 'sinc3_2', 'fun3_2', 'intel3_2', 'amb3_2', 
                    'attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1', 
                    'attr5_2', 'sinc5_2', 'intel5_2', 'fun5_2', 'amb5_2', 
                    'attr3_s', 'sinc3_s', 'intel3_s', 'fun3_s', 'amb3_s', 
                    'attr1_1', 'sinc1_1', 'fun1_1', 'intel1_1', 'amb1_1', 'shar1_1', 
                    'attr1_2', 'sinc1_2', 'fun1_2', 'intel1_2', 'amb1_2', 'shar1_2', 
                    'pf_o_att','pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha', 
                    'attr1_s', 'sinc1_s', 'fun1_s', 'intel1_s', 'amb1_s', 'shar1_s', 
                    'attr7_2', 'sinc7_2', 'fun7_2', 'intel7_2', 'amb7_2', 'shar7_2', 
                    'attr4_1', 'sinc4_1', 'fun4_1', 'intel4_1', 'amb4_1', 'shar4_1', 
                    'attr4_2', 'sinc4_2', 'fun4_2', 'intel4_2', 'amb4_2', 'shar4_2', 
                    'attr2_1', 'sinc2_1', 'fun2_1', 'intel2_1', 'amb2_1', 'shar2_1', 
                    'attr2_2', 'sinc2_2', 'fun2_2', 'intel2_2', 'amb2_2', 'shar2_2']]

subset_df

Unnamed: 0,iid,gender,round,wave,pid,samerace,imprace,imprelig,age,age_o,...,fun2_1,intel2_1,amb2_1,shar2_1,attr2_2,sinc2_2,fun2_2,intel2_2,amb2_2,shar2_2
0,1,0,10,1,11.0,0,2.0,4.0,21.0,27.0,...,20.0,15.0,5.0,5.0,,,,,,
1,1,0,10,1,12.0,0,2.0,4.0,21.0,22.0,...,20.0,15.0,5.0,5.0,,,,,,
2,1,0,10,1,13.0,1,2.0,4.0,21.0,22.0,...,20.0,15.0,5.0,5.0,,,,,,
3,1,0,10,1,14.0,0,2.0,4.0,21.0,23.0,...,20.0,15.0,5.0,5.0,,,,,,
4,1,0,10,1,15.0,0,2.0,4.0,21.0,24.0,...,20.0,15.0,5.0,5.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,1,22,21,526.0,0,1.0,1.0,25.0,26.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8374,552,1,22,21,527.0,0,1.0,1.0,25.0,24.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8375,552,1,22,21,528.0,0,1.0,1.0,25.0,29.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8376,552,1,22,21,529.0,0,1.0,1.0,25.0,22.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0


In [240]:
subset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 113 entries, iid to shar2_2
dtypes: float64(104), int64(9)
memory usage: 7.2 MB


### Renaming Features:

In [241]:
def renameFeature(feature, new_feature_name, df):
    df.rename(columns={feature: new_feature_name}, inplace = True)

In [242]:
#renaming 'round' to 'met_count'

renameFeature('round', 'met_count', subset_df)
subset_df['met_count']

0       10
1       10
2       10
3       10
4       10
        ..
8373    22
8374    22
8375    22
8376    22
8377    22
Name: met_count, dtype: int64

### Reassign 


In [None]:
#df = df.set_index('iid')

### Feature Info:

******

### Seperating Dataset into Two datasets: 1 Female; 1 Male

In [243]:
df_female_raw = subset_df[subset_df.gender == 0]
df_female_raw

Unnamed: 0,iid,gender,met_count,wave,pid,samerace,imprace,imprelig,age,age_o,...,fun2_1,intel2_1,amb2_1,shar2_1,attr2_2,sinc2_2,fun2_2,intel2_2,amb2_2,shar2_2
0,1,0,10,1,11.0,0,2.0,4.0,21.0,27.0,...,20.0,15.0,5.0,5.0,,,,,,
1,1,0,10,1,12.0,0,2.0,4.0,21.0,22.0,...,20.0,15.0,5.0,5.0,,,,,,
2,1,0,10,1,13.0,1,2.0,4.0,21.0,22.0,...,20.0,15.0,5.0,5.0,,,,,,
3,1,0,10,1,14.0,0,2.0,4.0,21.0,23.0,...,20.0,15.0,5.0,5.0,,,,,,
4,1,0,10,1,15.0,0,2.0,4.0,21.0,24.0,...,20.0,15.0,5.0,5.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7889,530,0,22,21,548.0,0,1.0,1.0,22.0,30.0,...,10.0,10.0,10.0,20.0,30.0,20.0,15.0,10.0,10.0,15.0
7890,530,0,22,21,549.0,0,1.0,1.0,22.0,28.0,...,10.0,10.0,10.0,20.0,30.0,20.0,15.0,10.0,10.0,15.0
7891,530,0,22,21,550.0,0,1.0,1.0,22.0,30.0,...,10.0,10.0,10.0,20.0,30.0,20.0,15.0,10.0,10.0,15.0
7892,530,0,22,21,551.0,0,1.0,1.0,22.0,27.0,...,10.0,10.0,10.0,20.0,30.0,20.0,15.0,10.0,10.0,15.0


In [244]:
df_male_raw = subset_df[subset_df.gender == 1]
df_male_raw

Unnamed: 0,iid,gender,met_count,wave,pid,samerace,imprace,imprelig,age,age_o,...,fun2_1,intel2_1,amb2_1,shar2_1,attr2_2,sinc2_2,fun2_2,intel2_2,amb2_2,shar2_2
100,11,1,10,1,1.0,0,7.0,3.0,27.0,21.0,...,20.0,20.0,25.0,5.0,,,,,,
101,11,1,10,1,2.0,1,7.0,3.0,27.0,24.0,...,20.0,20.0,25.0,5.0,,,,,,
102,11,1,10,1,3.0,1,7.0,3.0,27.0,25.0,...,20.0,20.0,25.0,5.0,,,,,,
103,11,1,10,1,4.0,1,7.0,3.0,27.0,23.0,...,20.0,20.0,25.0,5.0,,,,,,
104,11,1,10,1,5.0,1,7.0,3.0,27.0,21.0,...,20.0,20.0,25.0,5.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,1,22,21,526.0,0,1.0,1.0,25.0,26.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8374,552,1,22,21,527.0,0,1.0,1.0,25.0,24.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8375,552,1,22,21,528.0,0,1.0,1.0,25.0,29.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0
8376,552,1,22,21,529.0,0,1.0,1.0,25.0,22.0,...,30.0,0.0,0.0,20.0,50.0,5.0,20.0,10.0,5.0,10.0


# Compress Features Within Dataset to Get Averages & Sums

### Function for Sums:

In [245]:
def getSum(feature, df):
    for iid in range(len(df)):
        sum = df.groupby(['iid'])[feature].sum()
        return sum    

### Function to Get Averages of a Feature Set:

In [246]:
def getAve(feature, divs, df):
    for iid in range(len(df)):
        ave = df.groupby(['iid'])[[feature, divs]].mean()
        return ave

#divs = denominator ##RENAME

### Function to Grab 1 Row of Data:

In [247]:
def getValueSet(feature, df):
    for iid in range(len(df)):
        denom = df.groupby(['iid'])[feature].value_counts()
        sum = df.groupby(['iid'])[feature].sum()
        divis = sum/denom
        return divis

In [248]:
## Test part of formula

df_male_raw.groupby(['iid'])['gender'].value_counts()

iid  gender
11   1         10
12   1         10
13   1         10
14   1         10
15   1         10
               ..
548  1         22
549  1         22
550  1         22
551  1         22
552  1         22
dtype: int64

In [249]:
df_male_raw.set_index(['iid'], inplace=True)

In [250]:
gender = getFeature('gender', df_male_raw)
df_male_raw[['gender']]

Unnamed: 0_level_0,gender
iid,Unnamed: 1_level_1
11,1
11,1
11,1
11,1
11,1
...,...
552,1
552,1
552,1
552,1


In [None]:
def getFeature(feature,df):
    for feature in range(len(df)):
        return feature


In [None]:
iid1 = df_female_raw[subset_df.iid == 1]
iid1

In [None]:
iid1 = iid1[iid1.columns[0:19]]
iid1

In [None]:
iid1['dec_sum'] = dec_sum[getSum('dec', iid1)]

In [None]:
iid1

In [None]:
pd.concat([iid1, dec_sum])

In [None]:
iid1['dec_sum']

In [None]:
def ConvertDF(DF, df):  #DF = old dataframe    #df = new data frame
    for iid in range(len(df)):
        df['iid'] = DF[getValueSet('iid', DF)]
        df['gender'] = DF[getValueSet('gender', DF)]
        df['met_count'] = DF[getValueSet('met_count', DF)]
        df['condtn'] = DF[getValueSet('condtn', DF)]
        df['exphappy'] = DF[getValueSet('exphappy', DF)]
        df['expnum'] = DF[getValueSet('expnum', DF)]
        df['match_es'] = DF[getValueSet('match_es', DF)]
        df['goal'] = DF[getValueSet('goal', DF)]
        
        df = pd.concat(['iid', 'gender', 'met_count','condtn','exphappy','expnum', 'match_es', 'goal'])



In [None]:
IID1 = pd.DataFrame({})

In [None]:
ConvertDF(iid1, IID1)

In [None]:
IID1

### *Female Dataset*

#### feature info here


**'Yes' Sum (#of decision = yes):**

**dec_o sum = sum of 'yes' per men for women**:

### *Male Dataset*

In [None]:
df_male = pd.DataFrame({})

## Set index to iid

In [None]:
df = df.set_index('iid')

In [None]:
def ConvertDF(DF, df):  #DF = old dataframe    #df = new data frame
    for iid in range(len(df)):
        df['iid'] = DF[getValueSet('iid', DF)]
        df['gender'] = DF[getValueSet('gender', DF)]
        df['met_count'] = DF[getValueSet('met_count', DF)]
        df['condtn'] = DF[getValueSet('condtn', DF)]
        df['exphappy'] = DF[getValueSet('exphappy', DF)]
        df['expnum'] = DF[getValueSet('expnum', DF)]
        df['match_es'] = DF[getValueSet('match_es', DF)]
        df['goal'] = DF[getValueSet('goal', DF)]
        
        attr3_1 = getValueSet('attr3_1', DF)
        sinc3_1 = getValueSet('sinc3_1', DF)
        fun3_1 = getValueSet('fun3_1', DF)
        intel3_1 = getValueSet('intel3_1', DF)
        amb3_1= getValueSet('amb3_1', DF)
        
        attr3_2 = getValueSet('attr3_2', DF)
        sinc3_2 = getValueSet('sinc3_2', DF)
        fun3_2 = getValueSet('fun3_2', DF)
        intel3_2= getValueSet('intel3_2', DF)
        amb3_2 = getValueSet('amb3_2', DF)
        
        attr5_1 = getValueSet('attr5_1', DF)
        sinc5_1 = getValueSet('sinc5_1', DF)
        fun5_1 = getValueSet('fun5_1', DF)
        intel5_1 = getValueSet('intel5_1', DF)
        amb5_1 = getValueSet('amb5_1', DF)
        
        attr5_2 = getValueSet('attr5_2', DF)
        sinc5_2 = getValueSet('sinc5_2', DF)
        fun5_2 = getValueSet('fun5_2', DF)
        intel5_2 = getValueSet('intel5_2', DF)
        amb5_2 = getValueSet('amb5_2', DF)
        
        attr3_s = getValueSet('attr3_s', DF)
        sinc3_s = getValueSet('sinc3_s', DF)
        fun3_s = getValueSet('fun3_s', DF)
        intel3_s = getValueSet('intel3_s', DF)
        amb3_s = getValueSet('amb3_s', DF)
        
        attr1_1 = getValueSet('attr1_1', DF)
        sinc1_1 = getValueSet('sinc1_1', DF)
        fun1_1 = getValueSet('fun1_1', DF)
        intel1_1 = getValueSet('intel1_1', DF)
        amb1_1 = getValueSet('amb1_1', DF)
        shar1_1 = getValueSet('shar1_1', DF)
        
        attr1_s = getValueSet('attr1_s', DF)
        sinc1_s = getValueSet('sinc1_s', DF)
        fun1_s = getValueSet('fun1_s', DF)
        intel1_s = getValueSet('intel1_s', DF)
        amb1_s = getValueSet('amb1_s', DF)
        shar1_s = getValueSet('shar1_s', DF)
        
        attr1_2 = getValueSet('attr1_2', DF)
        sinc1_2 = getValueSet('sinc1_2', DF)
        fun1_2 = getValueSet('fun1_2', DF)
        intel1_2 = getValueSet('intel1_2', DF)
        amb1_2 = getValueSet('amb1_2', DF)
        shar1_2 = getValueSet('shar1_2', DF)
        
        attr7_2 = getValueSet('attr7_2', DF)
        sinc7_2 = getValueSet('sinc7_2', DF)
        fun7_2 = getValueSet('fun7_2', DF)
        intel7_2 = getValueSet('intel7_2', DF)
        amb7_2 = getValueSet('amb7_2', DF)
        shar7_2 = getValueSet('shar7_2', DF)
        
        attr4_1 = getValueSet('attr4_1', DF)
        sinc4_1 = getValueSet('sinc4_1', DF)
        fun4_1 = getValueSet('fun4_1', DF)
        intel4_1 = getValueSet('intel4_1', DF)
        amb4_1 = getValueSet('amb4_1', DF)
        shar4_1 = getValueSet('shar4_1', DF)
        
        attr4_2 = getValueSet('attr4_2', DF)
        sinc4_2 = getValueSet('sinc4_2', DF)
        fun4_2 = getValueSet('fun4_2', DF)
        intel4_2 = getValueSet('intel4_2', DF)
        amb4_2 = getValueSet('amb4_2', DF)
        shar4_2 = getValueSet('shar4_2', DF)
        
        attr2_1 = getValueSet('attr2_1', DF)
        sinc2_1 = getValueSet('sinc2_1', DF)
        fun2_1 = getValueSet('fun2_1', DF)
        intel2_1= getValueSet('intel2_1', DF)
        amb2_1 = getValueSet('amb2_1', DF)
        shar2_1 = getValueSet('shar2_1', DF)
        
        attr2_2 = getValueSet('attr2_2', DF)
        sinc2_2 = getValueSet('sinc2_2', DF)
        fun2_2 = getValueSet('fun2_2', DF)
        intel2_2 = getValueSet('intel2_2', DF)
        amb2_2 = getValueSet('amb2_2', DF)
        shar2_2 = getValueSet('shar2_2', DF)
        
        #RENAME ALL BELOW
        match_sum = getSum('match', DF)
        dec_sum = getSum('dec', DF) #sum of subject's decisions (num of 'yes'')
        dec_o_sum = getSum('dec_o', DF) #sum of parnter's decisions (num of 'yes'')
        
        
        match_es_ave = getAve('match_es', 'met_count', DF)  #MATCH_ES_AVE  = % of people they think they'll match with
        expnum_ave =getAve('expnum', 20, DF)  #EXPNUM_AVE (/20ppl)
        exphappy_ave = getAve('exphappy', 10, DF)  #EXPHAPPY_AVE (/10PTS) ##????
        like_ave = getAve('like', 'met_count', DF)  #LIKE_AVE ##?
        like_o_ave = getAve('like_o', 'met_count', DF)  #LIKE_O_AVE ##?
        prob_ave = getAve('prob', 'met_count', DF)  #PROB_AVE ##?
        prob_o_ave = getAve('prob_o', 'met_count', DF)  #PROB_O_AVE ##?
        
        dec_ave = getAve('dec', 'met_count', DF)  #DEC_AVE (average decision of subject to want to date)
        dec_o_ave = getAve('dec_o', 'met_count', DF)  #DEC_AVE (average dec of partners to want to date subject)
        match_ave = getAve('match', 'met_count', DF) #MATCH_AVE (average match count/#of people met with)
        
        attr_ave = getAve('attr', 'met_count', DF) 
        sinc_ave = getAve('sinc', 'met_count', DF)  
        intel_ave = getAve('intel', 'met_count', DF) 
        fun_ave = getAve('fun', 'met_count', DF) 
        amb_ave = getAve('amb', 'met_count', DF) 
        shar_ave = getAve('shar', 'met_count', DF) 
        
        attr_o_ave = getAve('attr_o', 'met_count', DF) 
        sinc_o_ave = getAve('sinc_o', 'met_count', DF)  
        intel_o_ave = getAve('intel_o', 'met_count', DF) 
        fun_o_ave = getAve('fun_o', 'met_count', DF) 
        amb_o_ave = getAve('amb_o', 'met_count', DF) 
        shar_o_ave = getAve('shar_o', 'met_count', DF) 
        
        pf_o_att_ave = getAve('pf_o_att', 'met_count', DF) 
        pf_o_sin_ave = getAve('pf_o_sin', 'met_count', DF)  
        pf_o_int_ave = getAve('pf_o_int', 'met_count', DF) 
        pf_o_fun_ave = getAve('pf_o_fun', 'met_count', DF) 
        pf_o_amb_ave = getAve('pf_o_amb', 'met_count', DF) 
        pf_o_sha_ave = getAve('pf_o_sha', 'met_count', DF) 
        
        
        
        DF = pd.concat(iid, gender, met_count, condtn, exphappy, expnum, match_es, goal)
        DF = pd.concat(attr3_1, sinc3_1, fun3_1, intel3_1, amb3_1)
        DF = pd.concat(attr3_2, sinc3_2, fun3_2, intel3_2, amb3_2)
        DF = pd.concat(attr5_1, sinc5_1, fun5_1, intel5_1, amb5_1)
        DF = pd.concat(attr5_2, sinc5_2, fun5_2, intel5_2, amb5_2)
        DF = pd.concat(attr3_s, sinc3_s, fun3_s, intel3_s, amb3_s)
        DF = pd.concat(attr1_1, sinc1_1, fun1_1, intel1_1, amb1_1, shar1_1)
        DF = pd.concat(attr1_s, sinc1_s, fun1_s, intel1_s, amb1_s, shar1_s)
        DF = pd.concat(attr1_2, sinc1_2, fun1_2, intel1_2, amb1_2, shar1_2)
        DF = pd.concat(attr7_2, sinc7_2, fun7_2, intel7_2, amb7_2, shar7_2)
        DF = pd.concat(attr4_1, sinc4_1, fun4_1, intel4_1, amb4_1, shar4_1)
        DF = pd.concat(attr4_2, sinc4_2, fun4_2, intel4_2, amb4_2, shar4_2)
        DF = pd.concat(attr2_1, sinc2_1, fun2_1, intel2_1, amb2_1, shar2_1)
        DF = pd.concat(attr2_2, sinc2_2, fun2_2, intel2_2, amb2_2, shar2_2)    
        DF = pd.concat(match_sum, dec_sum, dec_o_sum)
        DF = pd.concat(match_es_ave, expnum_ave, exphappy_ave, like_ave, like_o_ave)
        DF = pd.concat(prob_ave, prob_o_ave, dec_ave, dec_o_ave, match_ave)
        DF = pd.concat(attr_ave, sinc_ave, intel_ave, fun_ave, amb_ave, shar_ave)
        DF = pd.concat(attr_o_ave, sinc_o_ave, intel_o_ave, fun_o_ave, amb_o_ave, shar_o_ave)
        DF = pd.concat(pf_o_att_ave, pf_o_sin_ave, pf_o_int_ave, pf_o_fun_ave, pf_o_amb_ave, pf_o_sha_ave)
       
    
        
      
            
  

 #DF = pd.DataFrame({})
#DF = DF.set_index('iid')



#df = pd.concat([df_1, df_6])




## drop outliers? drop nan values? (for each compressed feature)?

EG:
    

len(subset_df.match_es)

len(subset_df.match_es.dropna())

subset_df.match_es.dropna(inplace = True) #removing NaN values

len(subset_df.match_es)

EG:

Q1 = df.Newspaper.quantile(0.25)
Q3 = df.Newspaper.quantile(0.75)

IQR = Q3 - Q1

IQR


df.drop(df[df.Newspaper > Q3 + 1.5 * IQR].index, inplace = True)

len(df)