# Regression and classification

Please don't use anything other than scikit-learn for trivial problems!

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

### Loading data

In [None]:
# Read the raw crx table: comma-separated, no header row in the file,
# and '?' is the marker for a missing value.
crx_data = pd.read_csv('crx.data.txt', header=None, na_values='?', sep=',')

# The file carries no column names, so label the columns A1, A2, ..., An.
crx_data.columns = ['A{}'.format(position)
                    for position in range(1, crx_data.shape[1] + 1)]

# Preview the first ten rows.
crx_data.head(10)

### Basic preprocessing

In [None]:
# Cast A11 and A15 to float, then discard every row that still contains a
# missing value and renumber the remaining rows from 0.
crx_data[['A11', 'A15']] = crx_data[['A11', 'A15']].astype(float)
crx_data = crx_data.dropna().reset_index(drop=True)

X = crx_data.iloc[:, :-1]  # every column except the last one: features
y = crx_data.iloc[:, -1]   # last column: target variable

# One-hot encode the categorical features. get_dummies already prefixes every
# dummy column with the name of the column it came from, so no explicit
# `prefix` list is passed: a hand-built list filtered on `dtype == object`
# must match the encoded columns exactly and raises a length-mismatch error
# as soon as a text column is stored with a non-object (e.g. string) dtype.
# drop_first=True drops one level per feature to avoid redundant columns.
X = pd.get_dummies(X, drop_first=True)

# Hold out 30% of the rows for testing; random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=10)

# Standardization: StandardScaler removes the mean and scales to unit variance.
# The scaler is fitted on the training part only and then applied to both
# parts, so no statistics of the test rows leak into training.
from sklearn.preprocessing import StandardScaler

scaler_train = StandardScaler().fit(X_train)
X_train = scaler_train.transform(X_train)
X_test = scaler_train.transform(X_test)

## Random Forest

Please read this article: https://medium.com/open-machine-learning-course/open-machine-learning-course-topic-5-ensembles-of-algorithms-and-random-forest-8e05246cbba7

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Build the forest and fit it on the standardized training split
# (fit returns the estimator itself, so the two steps can be chained).
random_forest = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_features='sqrt',
    max_depth=10,
    min_samples_split=30,
    bootstrap=True,
    n_jobs=4,
    random_state=0,
).fit(X_train, y_train)

# Class predictions for the held-out rows.
rf_predict = random_forest.predict(X_test)

# Disabled snippet: prints one importance per feature name.
# NOTE(review): X.columns[:-1] skips the last feature column; confirm that is
# intended before re-enabling.
# for i, c in enumerate(X.columns[:-1]):
#     print(c + ': ' + '%.4f' % random_forest.feature_importances_[i])

In [None]:
# Display the first fitted sub-estimator (tree) of the forest.
random_forest.estimators_[0]

## Gradient boosting

Please read this article: https://medium.com/open-machine-learning-course/open-machine-learning-course-topic-10-gradient-boosting-c751538131ac

In [None]:
# Load the hostel table; it is expected to provide the 'hostel' and 'rating'
# columns that later cells select by name.
hostel_data = pd.read_csv("hostel_factors.csv")

In [None]:
# Human-readable label for each factor key: "f1" maps to the first label,
# "f2" to the second, and so on, in the order listed here.
features = {
    "f{}".format(number): label
    for number, label in enumerate(
        [
            "Stuff",
            "Hostel booking",
            "Check-in & Check-out",
            "Room condition",
            "Kitchen condition",
            "Global condition",
            "Additional services",
            "Facilities",
            "Price/quality",
        ],
        start=1,
    )
}

In [None]:
# Regression target: the 'rating' column.
y = hostel_data['rating']
# Model inputs: every column except 'hostel' and the target itself.
# Note that this rebinds X and y, which previously held the crx data.
X = hostel_data.drop(columns=['hostel', 'rating'])

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Many shallow trees combined with a small learning rate.
gradient_boosting = GradientBoostingRegressor(
    loss='huber',
    learning_rate=.01,
    n_estimators=1000,
    max_depth=1,        # depth-1 trees
    subsample=.6,       # each stage is fitted on 60% of the rows
    max_features=None,
    random_state=0,
)

# Fit on the full hostel feature matrix; the fitted estimator is the cell output.
gradient_boosting.fit(X, y)

In [None]:
# Out-of-bag loss improvement, one value per boosting iteration.
# plt.plot with a single sequence uses 0..len-1 as x, so no explicit x array
# is needed. Labels and a title are set so the figure can be read on its own.
plt.plot(gradient_boosting.oob_improvement_)
plt.xlabel("Boosting iteration")
plt.ylabel("OOB loss improvement")
plt.title("Gradient boosting: out-of-bag improvement per iteration")
plt.show()

In [None]:
# Training loss recorded by the model, one value per boosting iteration.
# plt.plot with a single sequence uses 0..len-1 as x, so no explicit x array
# is needed. Labels and a title are set so the figure can be read on its own.
plt.plot(gradient_boosting.train_score_)
plt.xlabel("Boosting iteration")
plt.ylabel("Training loss")
plt.title("Gradient boosting: training loss per iteration")
plt.show()

### How to measure quality?

In [None]:
# here we need to Google a little bit

### How to fit model in the best way?

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Cross-validate a random forest for several forest sizes and report the size
# with the best mean validation accuracy.
#
# NOTE(review): X and y are whatever the most recent cell bound to those
# names. At this point that is the hostel feature table and its 'rating'
# column. StratifiedKFold and RandomForestClassifier need discrete class
# labels, so confirm this is the intended dataset.

# 5-fold stratified split, shuffled with a fixed seed so folds are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Forest sizes to compare.
trees_grid = [5, 10, 15, 20, 30, 50, 75, 100]

# Accuracy per forest size (rows) and per fold (columns).
train_acc = []
test_acc = []

for ntrees in trees_grid:
    # oob_score=True is kept from the original configuration; the OOB score
    # itself is not read anywhere in this cell.
    rfc = RandomForestClassifier(n_estimators=ntrees, random_state=42, n_jobs=-1, oob_score=True)
    temp_train_acc = []
    temp_test_acc = []
    for train_index, test_index in skf.split(X, y):
        # split() yields positional row numbers, so select with .iloc; label
        # based selection only works while the index happens to be 0..n-1.
        # Fold-specific names keep the earlier X_train / X_test / y_train /
        # y_test / scaler_train variables from being overwritten.
        X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
        y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training fold only, then apply it to both folds.
        fold_scaler = StandardScaler().fit(X_fold_train)
        X_fold_train = fold_scaler.transform(X_fold_train)
        X_fold_test = fold_scaler.transform(X_fold_test)

        rfc.fit(X_fold_train, y_fold_train)
        temp_train_acc.append(rfc.score(X_fold_train, y_fold_train))
        temp_test_acc.append(rfc.score(X_fold_test, y_fold_test))
    train_acc.append(temp_train_acc)
    test_acc.append(temp_test_acc)

train_acc, test_acc = np.asarray(train_acc), np.asarray(test_acc)

# Average over folds once, then pick the forest size with the highest mean.
mean_test_acc = test_acc.mean(axis=1)
best_position = int(np.argmax(mean_test_acc))
print("Best accuracy on CV is {:.2f}% with {} trees".format(mean_test_acc[best_position]*100,
                                                        trees_grid[best_position]))

In [None]:
plt.style.use('ggplot')

# Per-forest-size statistics over the folds, computed once and reused below.
train_mean = train_acc.mean(axis=1)
cv_mean = test_acc.mean(axis=1)
cv_std = test_acc.std(axis=1)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(trees_grid, train_mean, alpha=0.5, color='blue', label='train')
ax.plot(trees_grid, cv_mean, alpha=0.5, color='red', label='cv')

# Shaded bands around the CV curve: +/- 1 std (darker), then +/- 2 std (lighter).
for width, opacity in ((1, 0.4), (2, 0.2)):
    ax.fill_between(trees_grid, cv_mean - width*cv_std, cv_mean + width*cv_std,
                    color='#888888', alpha=opacity)

ax.legend(loc='best')
ax.set_ylim([0.8,1.02])
ax.set_ylabel("Accuracy")
ax.set_xlabel("N_estimators");

### What features are important?

In [None]:
# Rank the features of the fitted gradient-boosting model and plot the top ones.
importances = gradient_boosting.feature_importances_

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
num_to_plot = 5
# 1-based feature numbers used to build the "f<N>" keys of `features`.
# Assumes the model's feature columns are ordered f1, f2, ... (TODO confirm).
feature_indices = [ind+1 for ind in indices[:num_to_plot]]

# Print the feature ranking
print("Feature ranking:")
for rank, (number, position) in enumerate(zip(feature_indices, indices[:num_to_plot]), start=1):
    print("%d. %s %f " % (rank, features["f"+str(number)], importances[position]))

# Bar heights, and one grayscale shade per bar (as strings, darkest first).
top_importances = importances[indices[:num_to_plot]]
gray_levels = [str(i/float(num_to_plot+1)) for i in range(num_to_plot)]

plt.figure(figsize=(15,5))
plt.title("Feature importances")
bars = plt.bar(range(num_to_plot), top_importances, color=gray_levels, align="center")
# Tick labels are the feature numbers; the legend maps each bar to its label.
ticks = plt.xticks(range(num_to_plot), feature_indices)
plt.xlim([-1, num_to_plot])
plt.legend(bars, [features["f"+str(i)] for i in feature_indices]);

### What to do next?

Unsupervised learning, of course!