In [None]:
# Import needed packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Import dataset from file directory
WV2015F = pd.read_excel("WV2015F.xlsx")

In [None]:
WV2015F.shape

In [None]:
WV2015F.head()

In [None]:
# Create another object holding the data and print the names of the file's variables (columns)
dataf = WV2015F[:]


In [None]:
dataf.shape

In [None]:
#Create dataset including the variables needed for the analysis

dataf1 = dataf.iloc[:, np.r_[2:20,21,22:32,45,60:68,136:168,178,180,182]]
                    

In [None]:
dataf1.shape

In [None]:
dataf1.columns

In [None]:
# Calculate the median for each variable
dataf1.median().head()

In [None]:
# replace missing values (In tnis case with median, also can be replaced with mean, mode, based on the analysis)
dataf2 = dataf1.fillna(dataf1.median())


In [None]:
dataf2.shape

In [None]:
dataf2.iloc[:, np.r_[4:8,18]].describe()

In [None]:
# Combine Life.Sat and Happy into a single variable 
# but first recode Happy so 4='High Happy' (to make both variables show a 'better' score if higher)
# and then rescale Happy from 4 point scale to 10 point scale.

In [None]:
#create a dictionaty to recode values (see we use curled brackets to define a dictionary data structure)
d = {1:4, 2:3, 3:2, 4:1}
dataf2['Happy'] = dataf2['Happy'].apply(lambda x:d[x])

In [None]:
dataf2.Happy

In [None]:
Life_Happy = (((10/4)*dataf2.Happy)+dataf2['Life Sat'])/2
print(Life_Happy.describe())
len(Life_Happy)

In [None]:
# plot histogram to see distribution of the variable
# plt.hist(Life_Happy, bins='auto')   # in this case the number of bins is calculated by the app
plt.hist(Life_Happy, bins=20) # bins can be an integer (# of equal bins)
                              # a sequence that defines edges
                              # or a string that identifies a mode (like 'auto')


In [None]:
# Add Life.Happy to the dataset and remove Life.Sat and Happy
dataf3 = pd.concat([dataf2, Life_Happy.rename('Life_Happy')], axis=1)


In [None]:
dataf3.shape

In [None]:
dataf3.drop(columns=['Life Sat', 'Happy'], inplace=True)

In [None]:
dataf3.shape

In [None]:
dataf3.head()

REGRESSION TREE EXAMPLE

In [None]:
# Create random training and testing datasets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dataf3.iloc[:,:-1], 
                                                    Life_Happy, test_size=0.33, 
                                                    random_state=42)

In [None]:
from sklearn import tree

# Build a regression tree limited to depth 3 (all other hyperparameters keep their
# defaults) and fit it on the training data; fit() hands back the fitted estimator
reg_mod = tree.DecisionTreeRegressor(max_depth=3).fit(X_train, y_train)
print(reg_mod)   # show the model and its hyperparameters

In [None]:
# use cross validation to assess the model, make change to hyperparameters and compare models
# we don't want to use the test data for fine tunning the model, since it will introduce noise 
# when measuring the final model (we need new data, not seen by the model for that)
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score(reg_mod,X_test,y_test,cv=5)

In [None]:
# we can plot the tree
tree.plot_tree(reg_mod) 

In [None]:
# Use model to predict DV for new data
reg_mod.predict(X_test)

In [None]:
# Check the accuracy of the prediction by calculating 
# (1) the correlation between the predicted and actual values and 
# (2) the mean of the squared errors
# here we use the test set 

In [None]:
import sklearn.metrics

In [None]:
np.corrcoef(y_test, reg_mod.predict(X_test))

In [None]:
sklearn.metrics.mean_squared_error(y_test, reg_mod.predict(X_test))

In [None]:
# We can also compare models on several quality measures at once.
# Model comparison is run on the training set so that the test set stays untouched
# for the final evaluation.
from sklearn.model_selection import cross_validate

scoring = ['neg_mean_squared_error', 'r2']
cross_validate(reg_mod, X_train, y_train, cv=10, scoring=scoring)

In [None]:
# List of the available scoring options (not all of them apply to every model).
# get_scorer_names() is the public accessor; the SCORERS dictionary used previously
# was removed in scikit-learn 1.3.
sorted(sklearn.metrics.get_scorer_names())

In [None]:
# To save preprocessed data we can use pandas, here save to csv file
dataf3.to_csv('/Users/catalina/Documents/jupyter_files/1 Psy ML 2020 Spring /wvs4.xcsv')

In [None]:
#descriptive statistics
dataf3.describe()

RANDOM FOREST REGRESSION

In [None]:
from sklearn.ensemble import RandomForestRegressor as rf

In [None]:
cross_val_score(rf(n_estimators=50),X_train,y_train, cv=5)

In [None]:
# we can use cross validation to select hyperparameters.
# for example we can observe how results change with different max depth
# and pick max depth based on results
for i in range(1,10):
    print(i, cross_val_score(rf(n_estimators=50, max_depth=i),X_train,y_train, cv=5).mean())

In [None]:
# or fix depth and change n_estimators
for i in [25,50,75,100,200]:
    print(i, cross_val_score(rf(n_estimators=i, max_depth=8),X_train,y_train, cv=5).mean())

In [None]:
# Redefine the model, train, predict 
rf = sklearn.ensemble.RandomForestRegressor(max_depth=9, n_estimators=75)
rf.fit(X_train, y_train)
rf.predict(X_test)

In [None]:
# Evaluate the results usint test set
# Correlation
np.corrcoef(y_test, rf.predict(X_test))

In [None]:
#MSE
sklearn.metrics.mean_squared_error(y_test, rf.predict(X_test))

In [None]:
rf.feature_importances_

RANDOM FOREST CLASSIFICATION EXAMPLE

In [None]:
# Add SES (Socioeconomic status) variable to the dataset to have a classification variable to predict
ses1 = WV2015F['SocialClass'][:]
ses1.describe()

In [None]:
ses1.isnull().sum()

In [None]:
ses1.fillna(int(ses1.median()), inplace=True)

In [None]:
ses1.describe()

In [None]:
dataf4 = pd.concat([dataf3, ses1.rename('Ses')], axis=1)

In [None]:
# training and test sets
X_train, X_test, y_train, y_test = train_test_split(dataf4.iloc[:,:-1], 
                                                    ses1, 
                                                    test_size=0.33, 
                                                    random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier as rfc

In [None]:
cross_val_score(rfc(n_estimators=50),X_train,y_train, cv=5)

In [None]:
for i in range(5,50,5):
    print(i, cross_val_score(rfc(n_estimators=50, max_depth=i),X_train,y_train, cv=5).mean())

In [None]:
# Redefine the model, train, predict 
rfc = sklearn.ensemble.RandomForestRegressor(max_depth=10, n_estimators=50)
rfc.fit(X_train, y_train)
rfc.predict(X_test)

In [None]:
rfc.get_params