In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [3]:
from scipy.stats import skew,kurtosis

In [4]:
import os

In [5]:
os.chdir('../pickles')

In [6]:
numerical=pickle.load(open('numerical.pickle','rb'))
categorical = pickle.load(open('categorical.pickle','rb'))

In [7]:
os.chdir('../output_data')

In [8]:
df = pd.read_csv('train_v03.csv',header=0)

In [9]:
df.set_index('PassengerId',inplace=True)

In [10]:
def edd(data):
    df_desc = data.describe().transpose()
    df_desc['Var'] = df_desc.index
    df_desc.reset_index(inplace=True)
    df_desc.drop('count',axis=1,inplace=True)
    df_desc['skewness'] = df_desc['Var'].apply(lambda x: skew(np.array(data.loc[data[x].notnull(),x])))
    df_desc['kurtosis'] = df_desc['Var'].apply(lambda x: kurtosis(np.array(data.loc[data[x].notnull(),x]),fisher=False))
    df_desc['99%'] = df_desc['Var'].apply(lambda x: data[x].quantile(.99))
    df_desc['95%'] = df_desc['Var'].apply(lambda x: data[x].quantile(.95))
    df_desc['90%'] = df_desc['Var'].apply(lambda x: data[x].quantile(.90))
    df_desc['10%'] = df_desc['Var'].apply(lambda x: data[x].quantile(.1))
    df_desc['5%'] = df_desc['Var'].apply(lambda x: data[x].quantile(.05))
    df_desc['1%'] = df_desc['Var'].apply(lambda x: data[x].quantile(.01))
    df_desc['mean-3sigma'] = df_desc['mean'] - 3*df_desc['std']
    df_desc['mean+3sigma'] = df_desc['mean'] + 3*df_desc['std']
    df_desc['mean-2sigma'] = df_desc['mean'] - 2*df_desc['std']
    df_desc['mean+2sigma'] = df_desc['mean'] + 2*df_desc['std']
    df_desc['type']='numeric'
    
    def is_category(col):
        return 'float' not in str(data[col].dtype) and 'int' not in str(data[col].dtype)
    
    categorical = [col for col in data.columns if is_category(col)]
    df_categorical = pd.DataFrame()
    df_categorical['Var']=np.array(categorical)
    
    df_categorical['type']='categorical'
    for col in [c for c in df_desc.columns if c not in ['Var','type']]:
        df_categorical[col]=np.nan
    for col in categorical:
        df_var = data[col].value_counts()
        df_cat = pd.DataFrame()
        df_cat['count']=df_var
        df_cat['categories']=df_var.index
        df_cat.reset_index(inplace=True)
        df_cat.sort_values(by='count',ascending=False,inplace=True)
        df_cat.set_index('categories',inplace=True)
        index_list = df_cat.index.tolist()
        for i,c in enumerate(['mean','min','1%','5%','10%','25%']):
            try:
                df_categorical.loc[df_categorical['Var']==col,c] = index_list[i]
            except:
                break
        for i,c in enumerate(['50%','75%','90%','95%','99%','max']):
            try:
                df_categorical.loc[df_categorical['Var']==col,c] = index_list[-(i+1)]
            except:
                break
        del df_var
        del df_cat
        del index_list
    df_categorical = df_categorical[df_desc.columns]
    edd = pd.concat([df_desc,df_categorical])
    del df_desc
    del df_categorical
    edd['count'] = edd['Var'].apply(lambda x: data[data[x].notnull()].shape[0])
    edd['nmiss'] = data.shape[0]-edd['count']
    edd['missing_rate'] = np.array(edd['nmiss']).astype('float')/data.shape[0] * 100
    edd['unique'] = edd['Var'].apply(lambda x: len(data[x].value_counts().index.tolist()))
    orig_cols = ['mean','min','1%','5%','10%','25%','50%','75%','90%','95%','99%','max']
    new_cols = ['mean_or_top1','min_or_top2','p1_or_top3','p5_or_top4','p10_or_top5','p25_or_top6',
                'p50_or_bottom6','p75_or_bottom5','p90_or_bottom4','p95_or_bottom3','p99_or_bottom2','max_or_bottom1']
    
    convert_dict = {}
    for i in range(len(orig_cols)):
        convert_dict[orig_cols[i]]=new_cols[i]
    edd.rename(columns=convert_dict,inplace=True)
    edd = edd[['Var','type','count','nmiss','missing_rate','unique','std','skewness','kurtosis','mean-3sigma',
               'mean-2sigma','mean_or_top1','min_or_top2','p1_or_top3','p5_or_top4','p10_or_top5','p25_or_top6',
               'p50_or_bottom6','p75_or_bottom5','p90_or_bottom4','p95_or_bottom3','p99_or_bottom2','max_or_bottom1'
              ,'mean+2sigma','mean+3sigma']]
    return edd

In [11]:
edd =edd(df)

In [12]:
from scipy.stats import chisquare
from statsmodels.stats.weightstats import ztest

In [13]:
def p_value_stat(col,target='Survived'):
    if col==target:
        return None
    try:
        if edd.loc[edd['Var']==col,'type'].values[0]=='categorical':
            obs_freq = []
            for cat in df[col].value_counts().index.tolist():
                obs_class=[]
                for i in df[target].value_counts().index.tolist():
                    obs = df[(df[col]==str(cat)) & (df[target]==i)].shape[0]
                    obs_class.append(obs)
                obs_freq.append(obs_class)
            obs_freq=np.array(obs_freq)
            return chisquare(obs_freq,axis=0)[1][0]
        else:
            return ztest(np.array(df.loc[(df[col].notnull()) & (df[target]==1),col]),
                        np.array(df.loc[(df[col].notnull()) & (df[target]==0),col]))[1]
    except Exception as e:
        print(e,col)
        return np.nan

In [14]:
edd['Anova/Chisquare pvalue'] = edd['Var'].apply(lambda x: p_value_stat(x))

In [15]:
os.chdir('../Statistics')

In [16]:
edd.to_csv('edd_after_treatment.csv',index=False)

In [17]:
cols_to_drop =list(edd.loc[edd['unique']<2,'Var'])

In [18]:
df.drop(cols_to_drop,axis=1,inplace=True)

In [19]:
def inter_correlation_clusters(data,cutoff=.7):
    correlations=data.corr()
    graph={}
    columns=data.columns
    for i in range(len(columns)):
        graph[i]=[]
        for j in range(len(columns)):
            if i!=j and np.abs(correlations.iloc[i,j])>cutoff:
                graph[i].append(j)
    
    tree_set={}
    component = 0
    visited = [0 for i in range(len(columns))]
    def dfs(i):
        visited[i]=1
        try:
            tree_set[component].append(i)
        except KeyError:
            tree_set[component] = [i]
            
        for j in graph[i]:
            if visited[j]==0:
                dfs(j)
                
    tree_cluster={}
    for key in list(tree_set.keys()):
        tree_cluster[key] = [columns[i] for i in tree_set[key]]
        
    return tree_cluster

In [20]:
def varclus(data):
    columns = []
    correlations=data.corr()
    clusters= inter_correlation_clusters(data)
    print(clusters)
    
    cols = list(data.columns)
    
    def distance(c1,c2):
        return np.max([[np.abs(correlations.loc[i,j]) for i in clusters[c1]] for j in clusters[c2]])
    
    def next_closest(c):
        minima=0
        point=c
        for c1 in [i for i in list(clusters.keys()) if i!=c]:
            dist = distance(c,c1)
            if dist>minima:
                minima=dist
                point=c1
        return point
    
    def get_squared_ratio(col,own_cluster,next_cluster):
        y = np.array(data[col])
        x = np.array(data[own_cluster].drop(col,axis=1))
        model=LinearRegression()
        model.fit(x,y)
        y_pred = list(model.predict(x))
        del x
        del model
        r2_own = r2_score(y,y_pred)
        del y_pred
        x = np.array(data[next_cluster])
        model=LinearRegression()
        model.fit(x,y)
        y_pred = list(model.predict(x))
        del x
        del model
        r2_next = r2_score(y,y_pred)
        del y
        del y_pred
        
        return float(1-r2_own)/(1-r2_next)
    
    for c1 in list(clusters.keys()):
        if len(clusters[c1])>1:
            own_cluster = clusters[c1]
            next_cluster = clusters[next_closest(c1)]
            ration=np.inf
            for col in clusters[c1]:
                col_ratio = get_squared_ratio(col,own_cluster,next_cluster)
                if col_ratio<ratio:
                    ratio=col_ratio
                    clust_col=col
            columns.append(clust_col)
        else:
            columns.append(clusters[c1][0])
            
    return columns

In [21]:
columns = varclus(df.drop(['Survived'],axis=1))

{}


In [22]:
for col in df.columns:
    if True in np.isinf(np.array(df[col])):
        print col

# Variance Inflation Factor

In [23]:
df.shape

(712, 23)

In [24]:
from patsy import dmatrices
import statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [25]:
def variance_inflation(data):
    vif = pd.DataFrame()
    vif['features'] = data.columns
    vif['vif factor'] = [variance_inflation_factor(data.values, i) for i in range(data.shape[1])]
    vif.sort_values(by='vif factor',ascending=False,inplace=True)
    vif.reset_index(inplace=True)
    vif.drop(['index'],axis=1,inplace=True)
    print(vif)
    return tuple(vif.loc[0,:].values)

In [26]:
vif_drop_cols=[]

In [27]:
def vif_reduction(data,limit=2.5):
    vif = variance_inflation(data)
    if vif[1]<=limit:
        return
    else:
        data.drop(vif[0],axis=1,inplace=True)
        vif_drop_cols.append(vif[0])
        print(vif[0]+' dropped')
        del vif
        vif_reduction(data,limit=limit)

In [28]:
vif_reduction(df.drop(['Survived'],axis=1))

              features  vif factor
0                  Age    4.876435
1   Pclass_dum_1 class    4.671065
2         Sex_dum_male    3.846066
3   Pclass_dum_3 class    3.536203
4                 Fare    2.957105
5                SibSp    1.876113
6                Parch    1.855157
7          Title_dum_0    1.823725
8         Ticket_dum_0    1.749816
9          Cabin_dum_1    1.649530
10        Ticket_dum_1    1.635919
11         Title_dum_1    1.580751
12         Cabin_dum_0    1.568212
13      Embarked_dum_0    1.396590
14        Ticket_dum_2    1.263407
15      Embarked_dum_1    1.245544
16         Title_dum_2    1.192343
17         Cabin_dum_2    1.190940
18        Ticket_dum_3    1.190451
19         Title_dum_3    1.115498
20        Ticket_dum_4    1.112128
21         Cabin_dum_3    1.020477
Age dropped
              features  vif factor
0   Pclass_dum_1 class    4.084489
1   Pclass_dum_3 class    3.184053
2         Sex_dum_male    3.037451
3                 Fare    2.938777
4       

In [29]:
len(vif_drop_cols)

3

In [30]:
os.chdir('../Statistics')

In [31]:
edd = pd.read_csv('edd_v03.csv',header=0)

In [32]:
edd.loc[edd['Var'].isin(vif_drop_cols),'Status']='drop'

In [33]:
edd['Reason']=''
edd.loc[edd['Var'].isin(vif_drop_cols),'Reason']='VIF'

In [34]:
edd.to_csv('edd_v04.csv',index=False)

In [35]:
df.drop(vif_drop_cols,axis=1,inplace=True)

In [36]:
os.chdir('../output_data')

In [37]:
df.to_csv('train_v04.csv')

# BackWard Selection

In [38]:
import statsmodels.discrete.discrete_model as sm
from sklearn.preprocessing import StandardScaler

In [39]:
flag=0
cols_dropped=['Survived']
while flag==0:
    model = sm.Logit(endog=np.array(df['Survived']),exog=np.array(df.drop(cols_dropped,axis=1)))
    results = model.fit()
    pvalues=list(results.pvalues)
    drop_index = pvalues.index(max(pvalues))
    col_drop = df.drop(cols_dropped,axis=1).columns[drop_index]
    print(col_drop+'-'+str(pvalues[drop_index]))
    if pvalues[drop_index]>.05:
        cols_dropped.append(col_drop)
    else:
        flag=1

         Current function value: 0.420185
         Iterations: 35
Cabin_dum_3-0.998449263888
Optimization terminated successfully.
         Current function value: 0.420427
         Iterations 7
Embarked_dum_1-0.967796654326
Optimization terminated successfully.
         Current function value: 0.420428
         Iterations 7
Ticket_dum_1-0.693436882623
Optimization terminated successfully.
         Current function value: 0.420538
         Iterations 7
Ticket_dum_3-0.646769403207
Optimization terminated successfully.
         Current function value: 0.420684
         Iterations 7
Title_dum_3-0.501292330786
Optimization terminated successfully.
         Current function value: 0.421042
         Iterations 6
Ticket_dum_4-0.370179391733
Optimization terminated successfully.
         Current function value: 0.421565
         Iterations 6
Cabin_dum_2-0.304046812961
Optimization terminated successfully.
         Current function value: 0.422300
         Iterations 6
Title_dum_2-0.24328430782



In [40]:
cols_dropped.remove('Survived')

In [41]:
len(cols_dropped)

10

In [42]:
df.shape

(712, 20)

In [43]:
df.drop(cols_dropped,axis=1,inplace=True)

In [44]:
df.columns

Index([u'SibSp', u'Parch', u'Fare', u'Survived', u'Sex_dum_male',
       u'Cabin_dum_0', u'Cabin_dum_1', u'Embarked_dum_0', u'Title_dum_0',
       u'Title_dum_1'],
      dtype='object')

In [45]:
os.chdir('../output_data')

In [46]:
df.to_csv('train_v05.csv')

In [47]:
os.chdir('../pickles')

In [48]:
pickle.dump(vif_drop_cols,open('vif_drop_cols.pickle','wb'))

In [49]:
pickle.dump(cols_dropped,open('backward_drop_cols.pickle','wb'))

In [50]:
df.shape

(712, 10)

In [51]:
df['Survived']

PassengerId
675    0
467    0
598    0
733    0
414    0
272    1
303    0
180    0
634    0
264    0
807    0
278    0
816    0
379    0
873    0
327    0
844    0
819    0
203    0
372    0
655    0
144    0
412    0
826    0
130    0
805    1
785    0
885    0
132    0
722    0
      ..
270    1
374    0
326    1
196    1
306    1
709    1
499    0
298    0
610    1
269    1
333    0
857    1
319    1
690    1
780    1
731    1
378    0
701    1
381    1
717    1
558    0
119    0
743    1
89     1
28     0
342    1
439    0
259    1
738    1
680    1
Name: Survived, Length: 712, dtype: int64

In [52]:
os.chdir('../Statistics')

In [53]:
edd = pd.read_csv('edd_v04.csv',header=0)

In [54]:
edd.loc[edd['Var'].isin(cols_dropped),'Status']='drop'

In [55]:
edd.loc[edd['Var'].isin(cols_dropped),'Reason']='Backward Selection'

In [56]:
edd.to_csv('edd_v05.csv',index=False)

In [57]:
os.chdir('../pickles')

In [58]:
numericals_final = list(set(set(df.columns) & set(numerical)))

In [59]:
len(numericals_final)

3

In [60]:
numericals_final

['Fare', 'SibSp', 'Parch']

In [61]:
dummies_final = [col for col in df.columns if 'dum' in col]

In [62]:
len(dummies_final)

6

In [63]:
pickle.dump(numericals_final,open('numerical_final.pickle','wb'))
pickle.dump(dummies_final,open('dummies_final.pickle','wb'))

# PCA

In [64]:
from sklearn.decomposition import PCA

In [65]:
from sklearn.preprocessing import StandardScaler

In [66]:
scale = StandardScaler()

In [67]:
scale.fit(np.array(df.drop(['Survived'],axis=1)))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [68]:
pickle.dump(scale,open('scale.pickle','wb'))

In [69]:
x = scale.transform(df.drop(['Survived'],axis=1))

In [70]:
df.shape

(712, 10)

In [71]:
pca = PCA(n_components=9)

In [72]:
pca.fit(x)

PCA(copy=True, iterated_power='auto', n_components=9, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [73]:
variance_cumulative = np.cumsum(pca.explained_variance_ratio_)

In [74]:
for i in range(61):
    if variance_cumulative[i]>=.9:
        n_components=i+1
        break

In [75]:
n_components

7

In [76]:
pca = PCA(n_components=n_components)

In [77]:
pca.fit(x)

PCA(copy=True, iterated_power='auto', n_components=7, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [78]:
pickle.dump(pca,open('pca.pickle','wb'))

In [79]:
df_pca_weights = pd.DataFrame(pca.components_,columns=df.drop(['Survived'],axis=1).columns,index=['pc'+str(i)
                                                                                                  for i in range(1,n_components+1)])

In [80]:
df_pca_magnitude_weights = pd.DataFrame(map(np.square,pca.components_),columns=df.drop(['Survived'],axis=1).columns,index=['pc'+str(i)
                                                                                                  for i in range(1,n_components+1)])

In [81]:
df_pca_weights

Unnamed: 0,SibSp,Parch,Fare,Sex_dum_male,Cabin_dum_0,Cabin_dum_1,Embarked_dum_0,Title_dum_0,Title_dum_1
pc1,0.351064,0.43567,0.351451,-0.461055,0.205323,0.163297,0.099388,0.381673,0.355641
pc2,-0.334634,-0.226143,0.427547,0.076164,0.446825,0.502386,0.387484,-0.170992,-0.129833
pc3,0.380274,0.351322,0.180598,0.502213,0.188102,-0.028858,-0.123751,-0.584203,0.233043
pc4,-0.306225,-0.050381,-0.203745,-0.240879,-0.15823,-0.139371,0.410489,-0.359597,0.679618
pc5,0.224437,0.116116,0.258822,0.169093,-0.386473,-0.30481,0.709698,0.061336,-0.30143
pc6,0.040679,0.128619,-0.027753,0.076327,-0.660474,0.727672,-0.062649,-0.054001,0.051316
pc7,-0.57322,0.406784,0.497357,-0.139511,-0.229868,-0.242796,-0.303164,-0.141829,-0.123713


In [82]:
os.chdir('../PCA')

In [83]:
df_pca_weights.to_csv('pca_weights.csv')

In [84]:
df_pca_magnitude_weights.to_csv('pca_magnitude_weights.csv')

In [85]:
df_pca = pd.DataFrame(pca.transform(x),columns=['pc'+str(i) for i in range(1,n_components+1)])

In [86]:
df_pca['Survived']=df['Survived']

In [87]:
os.chdir('../output_data')

In [88]:
df_pca.index = df.index

In [89]:
df_pca['Survived'].isnull().any()

True

In [90]:
df_pca.to_csv('train_pca.csv')