In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, balanced_accuracy_score, r2_score, mean_squared_log_error
from pandas.plotting import scatter_matrix
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.feature_selection import SelectKBest, f_regression, chi2

In [40]:
# Load titanic_train.csv from a fixed location below the user's home directory.
train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/titanic_train.csv"))

In [41]:
# Every column is kept in X (the label included); the label is also exposed as y.
features = train_data.columns.tolist()
X = train_data.loc[:, features]
y = train_data.loc[:, 'Survived']

In [42]:
# Split the columns of X into numeric and non-numeric feature lists.
# The np.int / np.float aliases were removed in NumPy 1.24, so the dtype test is
# delegated to pandas; every integer and float width now counts as numeric.
num_features = X.select_dtypes(include = 'number').columns.tolist()
cat_features = [feature for feature in X.columns if feature not in num_features]

In [43]:
# X_new.info()
# X = X[num_features]
# Print the column names, non-null counts and dtypes of X.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [44]:
# Reduce X to its non-numeric columns and append the label y as the last column.
X = pd.concat(objs = [X.loc[:, cat_features], y], axis = 'columns', sort = False)

In [27]:
# Display the non-numeric columns of X.
X[cat_features]

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S
...,...,...,...,...,...
886,"Montvila, Rev. Juozas",male,211536,,S
887,"Graham, Miss. Margaret Edith",female,112053,B42,S
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607,,S
889,"Behr, Mr. Karl Howell",male,111369,C148,C


In [46]:
# Write X to JSON and CSV; both paths are relative to the current working directory.
X.to_json('data/miscellaneous/train_Xcat_ycat.json')
X.to_csv('data/miscellaneous/train_Xcat_ycat.csv')
# X[cat_features].to_json('data/miscellaneous/test_Xnum_ynum.json')
# X[cat_features].to_csv('data/miscellaneous/test_Xnum_ynum.csv')

In [22]:
# Scratch check: an empty list is falsy, so "yom" is printed.
cat_features = list()
print("hi" if cat_features else "yom")

yom


In [23]:
# Regression pipeline on the target column: impute missing values, select
# features, encode them, then fit and score a random forest.
# NOTE(review): the recorded run of this cell stopped with FileNotFoundError because
# the CSV below was missing; the path is unchanged, so the file must exist first.
train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/miscellaneous/pure_num_data.csv"))
target = 'SalePrice'
ordinal_feature = 'OverallQual'

features = list(train_data)
# drop = True: a plain reset_index() inserts the old index as an extra "index"
# column, which then leaks into the feature frames and the correlation matrix
X = train_data[features].reset_index(drop = True)
y = train_data[target].reset_index(drop = True)

# split the columns by dtype (np.int / np.float were removed in NumPy 1.24)
num_features = X.select_dtypes(include = 'number').columns.tolist()
cat_features = [feature for feature in X.columns if feature not in num_features]

# defaults so that the later steps also work when one of the two groups is empty
X_num = pd.DataFrame(index = X.index)
X_cat = pd.DataFrame(index = X.index)
drop_multicoll_features = []
drop_corr_features = pd.Index([])

# if num_features is not empty
# impute using only numerical features
if num_features:
    imp = IterativeImputer(max_iter = 10, random_state = 42)
    imp.fit(X[num_features])
    X[num_features] = imp.transform(X[num_features])
    X_num = X[num_features].copy()

# if cat_features is not empty
# impute using only categorical features
if cat_features:
    imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
    # impute BEFORE casting to str: astype(str) turns NaN into the literal 'nan',
    # which the imputer would no longer recognise as missing
    X[cat_features] = imp.fit_transform(X[cat_features].astype(object))
    X[cat_features] = X[cat_features].astype(str)
    X_cat = X[cat_features].copy()

if num_features:
    # correlation matrix; keep the half of the columns that correlates most with the
    # target (the target itself is part of that half and is removed further down)
    k_num = round(len(X_num.columns) / 2)
    corrmat = X_num.corr()
    X_num_fs = corrmat.nlargest(k_num, target)[target].index

    # check for multicollinearity
    # if two features are strongly correlated with each other (>= 0.7)
    # the feature with the lower correlation with the target variable is dropped
    multicorr = {}
    dropped_features = []
    candidates = [feature for feature in corrmat.columns if feature != target]
    for position, feature1 in enumerate(candidates):
        # every unordered pair is visited exactly once, addressed by label
        for feature2 in candidates[position + 1:]:
            score = corrmat.loc[feature1, feature2]
            if score >= 0.7:
                multicorr[(feature1, feature2)] = score
                if corrmat.loc[feature1, target] < corrmat.loc[feature2, target]:
                    weaker = feature1
                else:
                    weaker = feature2
                if weaker not in dropped_features:
                    dropped_features.append(weaker)

    # drop the multicollinear features, then everything outside the top-k selection;
    # the second list is recorded before the drop so that it is not always empty
    X_num = X_num.drop(columns = dropped_features)
    drop_multicoll_features = dropped_features
    drop_corr_features = X_num.columns.difference(X_num_fs)
    X_num = X_num.drop(columns = drop_corr_features)

    # encode ordinal features (dummy variables), unless selection already removed them
    ord_data = [ordinal_feature]
    if ordinal_feature in X_num.columns:
        X_num = pd.get_dummies(X_num, columns = ord_data, drop_first = True)

# if cat_features is not empty
# encode categorical features
if cat_features:
    enc = OrdinalEncoder()
    enc.fit(X_cat)
    X_cat_enc = enc.transform(X_cat)

    # feature selection on categorical data (k is keyword-only in current scikit-learn)
    k_cat = max(1, round(len(X_cat.columns) / 2))
    fs = SelectKBest(f_regression, k = k_cat)
    fs.fit(X_cat_enc, y) # save!!
    X_cat_fs = fs.transform(X_cat_enc)
    # keep the original column names so that the final frame has string labels only
    X_cat_enc = pd.DataFrame(X_cat_fs, columns = X_cat.columns[fs.get_support()], index = X_cat.index)

# assemble the final feature matrix from whichever groups exist; this step used to sit
# inside the categorical branch, so purely numeric data was trained on X with the target
feature_frames = []
if cat_features:
    feature_frames.append(X_cat_enc)
if num_features:
    feature_frames.append(X_num.drop(columns = [target], errors = 'ignore'))
if not feature_frames:
    raise ValueError('no feature columns found in the data set')
X = pd.concat(feature_frames, axis = 1, sort = False)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# random_state makes the reported scores reproducible
rf = RandomForestRegressor(n_estimators = 800, min_samples_split = 2, min_samples_leaf = 1,
                            max_features = 'log2', max_depth = 70, bootstrap = False,
                            random_state = 42)
rf.fit(X_train, y_train)
# quantify quality of prediction
y_predict = rf.predict(X_test)
r_2_score = r2_score(y_test, y_predict)
rmsle = math.sqrt(mean_squared_log_error(y_test, y_predict))
ret_stmt = 'R^2 Score: ' + str(r_2_score) + '\n' + 'RMSLE: ' + str(rmsle)

print(ret_stmt)

FileNotFoundError: [Errno 2] File /Users/nguyentiendung/Desktop/Projects/api/data/miscellaneous/pure_num_data.csv does not exist: '/Users/nguyentiendung/Desktop/Projects/api/data/miscellaneous/pure_num_data.csv'

In [None]:
# Load titanic_train.csv from a fixed location below the user's home directory.
train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/titanic_train.csv"))

In [None]:
# Every column is kept in X (the label included); the label is also exposed as y.
features = train_data.columns.tolist()
X = train_data.loc[:, features]
y = train_data.loc[:, 'Survived']

In [None]:
# Split the columns of X into numeric and non-numeric feature lists.
# The np.int / np.float aliases were removed in NumPy 1.24, so the dtype test is
# delegated to pandas; every integer and float width now counts as numeric.
num_features = X.select_dtypes(include = 'number').columns.tolist()
cat_features = [feature for feature in X.columns if feature not in num_features]

In [None]:
# Combine the non-numeric columns of X with its 'Survived' column in a new frame.
X_new = pd.concat(objs = [X.loc[:, cat_features], X.loc[:, 'Survived']], axis = 'columns', sort = False)

In [None]:
# Write X_new to JSON and CSV; both paths are relative to the current working directory.
X_new.to_json('data/miscellaneous/pure_cat_data.json')
X_new.to_csv('data/miscellaneous/pure_cat_data.csv')

In [2]:
# Scratch check: slicing a tuple from position 2 keeps the remaining elements.
tuple_example = tuple(range(4))
tuple_example[2:]

(2, 3)

In [45]:
# Restore the object stored in del_feature.pkl.
# NOTE: unpickling can run arbitrary code; only load files from a trusted source.
with open('del_feature.pkl', 'rb') as pkl_file:
    del_feature = pickle.load(pkl_file)

In [46]:
# Display the value that was unpickled into del_feature.
del_feature

'Id'

In [1]:
# Scratch check: min() compares an int and a float directly.
lowest_score = min(54, 0.33)

In [2]:
# Display the smaller of the two values.
lowest_score

0.33

In [19]:
# Load train_Xcat_ynum.csv, using the first CSV column as the row index.
train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/miscellaneous/train_Xcat_ynum.csv"), index_col = [0])

In [21]:
# Print the column names, non-null counts and dtypes of train_data.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 44 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSZoning       1460 non-null   object
 1   Street         1460 non-null   object
 2   Alley          91 non-null     object
 3   LotShape       1460 non-null   object
 4   LandContour    1460 non-null   object
 5   Utilities      1460 non-null   object
 6   LotConfig      1460 non-null   object
 7   LandSlope      1460 non-null   object
 8   Neighborhood   1460 non-null   object
 9   Condition1     1460 non-null   object
 10  Condition2     1460 non-null   object
 11  BldgType       1460 non-null   object
 12  HouseStyle     1460 non-null   object
 13  RoofStyle      1460 non-null   object
 14  RoofMatl       1460 non-null   object
 15  Exterior1st    1460 non-null   object
 16  Exterior2nd    1460 non-null   object
 17  MasVnrType     1452 non-null   object
 18  ExterQual      1460 non-null

In [31]:
# Load train_Xcat_ynum.csv, using the first CSV column as the row index.
X = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/miscellaneous/train_Xcat_ynum.csv"), index_col = [0])
# split features
# The np.int / np.float aliases were removed in NumPy 1.24, so the dtype test is
# delegated to pandas; every integer and float width now counts as numeric.
num_features = X.select_dtypes(include = 'number').columns.tolist()
cat_features = [feature for feature in X.columns if feature not in num_features]

In [32]:
# Display the columns that were classified as numeric.
num_features

['SalePrice']

In [38]:
# Single-element list used as input for the all() check in the next cell.
bag = [6]

In [39]:
# Scratch check: prints "hi" only when every element of bag equals 6.
print("hi" if all(elem == 6 for elem in bag) else "yom")

hi


In [5]:
# Create a DataFrame with no rows and no columns.
test = pd.DataFrame()

In [6]:
# Display the empty DataFrame.
test

In [90]:
# Load the pickled feature-selection values and unpack the ordinal feature and target.
# NOTE: unpickling can run arbitrary code; only load files from a trusted source.
with open('fs_values.pkl', 'rb') as file:
    fs_values = pickle.load(file)

# Both supported layouts end with (ordinal_feature, target): five entries for the
# regressor, four for the classifier. Any other length used to be skipped silently,
# leaving ordinal_feature / target at whatever an earlier cell had set.
if len(fs_values) not in (4, 5):
    raise ValueError('fs_values.pkl holds ' + str(len(fs_values)) + ' values, expected 4 or 5')
ordinal_feature, target = fs_values[-2:]

In [86]:
# Display the list that was loaded from fs_values.pkl.
fs_values

[None, None, 'Pclass', 'Survived']

In [89]:
# Slice from position 2 onward; for a four-element list this is the last two entries.
fs_values[2:]

['Pclass', 'Survived']