In [64]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, balanced_accuracy_score, r2_score, mean_squared_log_error
from pandas.plotting import scatter_matrix
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.feature_selection import SelectKBest, f_regression, chi2

In [75]:
train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/house-prices-advanced-regression-techniques/" + 
                                            "train.csv"))

In [76]:
features = list(train_data)
X = train_data[features]
y = train_data['SalePrice']

In [72]:
# check for numeric features train_data
num_features = []
cat_features = []
for feature in X:
    if X[feature].dtypes == np.int or X[feature].dtypes == np.float:
        num_features.append(feature)
    else:
        cat_features.append(feature)

In [18]:
# X_new = pd.concat([X[num_features], X['SalePrice']], axis = 1, sort = False)

In [77]:
# X_new.info()
X = X[num_features]
# X = X.drop('SalePrice', axis = 1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf

In [78]:
# X_new.to_json('data/miscellaneous/Xcat_ynum.json')
# X_new.to_csv('data/miscellaneous/Xcat_ynum.csv')
X.to_json('data/miscellaneous/train_Xnum_ynum.json')
X_new.to_csv('data/miscellaneous/train_Xnum_ynum.csv')

In [30]:
cat_features = []
if cat_features:
    print("hi")
else:
    print("yom")

yom


In [39]:
# check for numeric features train_data
train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/miscellaneous/pure_num_data.csv"))
target = 'SalePrice'
ordinal_feature = 'OverallQual'

features = list(train_data)
X = train_data[features]
y = train_data[target]

num_features = []
cat_features = []
for feature in X:
    if X[feature].dtypes == np.int or X[feature].dtypes == np.float:
        num_features.append(feature)
    else:
        cat_features.append(feature)

X = X.reset_index()
# if num_features is not empty
# impute using only numerical features
if num_features:
    imp = IterativeImputer(max_iter = 10, random_state = 42)
    imp.fit(X[num_features])
    X[num_features] = imp.transform(X[num_features])
    X_num = X.drop(cat_features, axis = 1)

# if cat_features is not empty
# impute using only categorical features
if cat_features:
    imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
    X[cat_features] = imp.fit_transform(X[cat_features].astype(str))
    X_cat = X.drop(num_features, axis = 1)

# saleprice correlation matrix
k_num = round(len(X_num.columns) / 2)
corrmat = X_num.corr()
X_num_fs = corrmat.nlargest(k_num, target)[target].index

# check for multicollinearity
# if two features are strongly correlated with each other (>= 0.7) 
# the feature with the lower correlation with the target variable is dropped
multicorr = {}
k = len(corrmat)
for feature in corrmat:
    i = 1
    if feature != target:
        while i < k - 1:
            if corrmat[feature][i] >= 0.7 and feature != corrmat.index[i]:
                multicorr[feature] = corrmat.index[i], corrmat[feature][i]
            i = i + 1
    
# delete duplicates
corr_scores = []
for feature in list(multicorr.keys()):
    if multicorr[feature][1] in corr_scores:
        del multicorr[feature]
    else:
        corr_scores.append(multicorr[feature][1])
        
# remove the feature with the lower correlation coefficient (pearson)
dropped_features = [] 
for feature1, feature2 in multicorr.items():
    if corrmat[target][feature1] < corrmat[target][feature2[0]]:
        dropped_features.append(feature1)
    else:
        dropped_features.append(feature2[0])

# drop the features from X_num dataframe
for feature in X_num:
    if feature in dropped_features:
        X_num = X_num.drop(feature, axis = 1) 
X_num.drop(X_num.columns.difference(X_num_fs), 1, inplace = True)

drop_multicoll_features = dropped_features
drop_corr_features = X_num.columns.difference(X_num_fs)

# encode ordinal features (dummy variables)
ord_data = [ordinal_feature]
X_num = pd.get_dummies(X_num, columns = ord_data, drop_first = True)

# if cat_features is not empty
# encode categorical features
if cat_features:
    enc = OrdinalEncoder()
    enc.fit(X_cat)
    X_cat_enc = enc.transform(X_cat)
    
    # feature selection on categorical data
    k_cat = round(len(X_cat.columns) / 2)
    fs = SelectKBest(f_regression, k_cat)
    fs.fit(X_cat_enc, y) # save!!
    X_cat_fs = fs.transform(X_cat_enc)
    X_cat_enc = pd.DataFrame(X_cat_fs)
    
    # if cat_features and num_features are not empty
    # concatenate numerical and categorical features
    if cat_features and num_features:
        df_cat = pd.DataFrame(X_cat_enc, index = list(range(len(X.index))))
        df_num = pd.DataFrame(X_num, index = list(range(len(X.index))))
        X = pd.concat([df_cat, df_num], axis = 1, sort = False)
        X = X.drop([target], axis = 1)
    elif cat_features:
        X = pd.DataFrame(X_cat_enc)
    elif num_features:
        X = pd.DataFrame(X_num)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
    
rf = RandomForestRegressor(n_estimators = 800, min_samples_split = 2, min_samples_leaf = 1, 
                            max_features = 'log2', max_depth = 70, bootstrap = False)
rf.fit(X_train, y_train)
# quantify quality of prediction
y_predict = rf.predict(X_test)
r_2_score = r2_score(y_test, y_predict)
rmsle = math.sqrt(mean_squared_log_error(y_test, y_predict))
ret_stmt = 'R^2 Score: ' + str(r_2_score) + '\n' + 'RMSLE: ' + str(rmsle)

print(ret_stmt)

R^2 Score: 0.9524449313129651
RMSLE: 0.09987353221616248


In [3]:
train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/titanic_train.csv"))

In [5]:
features = list(train_data)
X = train_data[features]
y = train_data['Survived']

In [6]:
# check for numeric features train_data
num_features = []
cat_features = []
for feature in X:
    if X[feature].dtypes == np.int or X[feature].dtypes == np.float:
        num_features.append(feature)
    else:
        cat_features.append(feature)

In [7]:
X_new = pd.concat([X[cat_features], X['Survived']], axis = 1, sort = False)

In [8]:
X_new.to_json('data/miscellaneous/pure_cat_data.json')
X_new.to_csv('data/miscellaneous/pure_cat_data.csv')

In [48]:
tuple_example = (0, 1, 2, 3, 4)
tuple_example[3:]

(3, 4)