In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

features_path = "acsincome_ca_features.csv"
label_path = "acsincome_ca_labels.csv"

X = pd.read_csv(features_path, index_col=False, sep=",")
y = pd.read_csv(label_path, index_col = False, sep=",")

In [2]:
X, y = shuffle(X, y, random_state=42)

print(X.shape)
print(y.shape)
# only use the first N samples to limit training time
num_samples = int(len(X)*0.1)
X, y = X[:num_samples], y[:num_samples]

print(X.shape)
print(y.shape)

(195665, 10)
(195665, 1)
(19566, 10)
(19566, 1)


In [3]:
# Standrize the data
scaler = StandardScaler()
scaler.fit_transform(X)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [4]:
model_svm = SVR()
model_svm.fit(X_train,y_train.values.ravel())
print("train score SVM : ", model_svm.score(X_train,y_train))
print("test score SVM : ", model_svm.score(X_test,y_test))

train score SVM :  0.1012383507635225
test score SVM :  0.13233267613378885


In [5]:
model_randomF = RandomForestClassifier(max_depth=2, random_state=0)
model_randomF.fit(X_train,y_train.values.ravel())
print("train score random forest : ", model_randomF.score(X_train,y_train))
print("test score random forest : ", model_randomF.score(X_test,y_test))

train score random forest :  0.7745336059289548
test score random forest :  0.7835973428717424


In [6]:
model_adaBoost = AdaBoostClassifier(n_estimators=100, random_state=0)
model_adaBoost.fit(X_train,y_train.values.ravel())
print("train score Ada Boost : ", model_adaBoost.score(X_train,y_train))
print("test score Ada Boost : ", model_adaBoost.score(X_test,y_test))

train score Ada Boost :  0.81235624840276
test score Ada Boost :  0.815278487480838


In [7]:
model_GradientBoosting = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
model_GradientBoosting.fit(X_train,y_train.values.ravel())
print("train score Gradient Boosting : ", model_GradientBoosting.score(X_train,y_train))
print("test score Gradient Boosting : ", model_GradientBoosting.score(X_test,y_test))

train score Gradient Boosting :  0.8148479427549195
test score Gradient Boosting :  0.8155339805825242


In [8]:
print("SVM \n",cross_val_score(SVR(), X_train,y_train.values.ravel(),cv=5).mean())
print("Random Forest \n",cross_val_score(RandomForestClassifier(max_depth=2, random_state=0), X_train,y_train.values.ravel(),cv=5).mean())
print("Ada Boost \n",cross_val_score(AdaBoostClassifier(n_estimators=100, random_state=0), X_train,y_train.values.ravel(),cv=5).mean())
print("Gradient Boosting \n",cross_val_score(GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0), X_train,y_train.values.ravel(),cv=5).mean())

SVM 
 0.09500540344935424
Random Forest 
 0.7747249549236074
Ada Boost 
 0.8097365008066302
Gradient Boosting 
 0.8075003239786002
