# XGBoost Regression and Classification

In [52]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import * 
from xgboost import XGBRegressor
from xgboost import XGBClassifier

def classification_metrics(Y_pred, Y_true):
    """Score a set of predictions against reference labels.

    Note the argument order: predictions come first and the reference
    labels second. Each scorer is then invoked as ``scorer(Y_true, Y_pred)``.
    The AUC is computed from whatever is passed as ``Y_pred``.

    Args:
        Y_pred: Predicted values to be scored.
        Y_true: Reference values to score against.

    Returns:
        A 5-tuple ``(accuracy, precision, recall, f1, auc)``, in that order.
    """
    # Order matters: callers unpack the result positionally.
    scorers = (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        roc_auc_score,
    )
    return tuple(scorer(Y_true, Y_pred) for scorer in scorers)

def display_metrics(classifierName, Y_pred, Y_true):
    """Print a framed report of the scores from ``classification_metrics``.

    Args:
        classifierName: Text printed after "Model: " in the report header.
        Y_pred: Predicted values, forwarded to ``classification_metrics``.
        Y_true: Reference values, forwarded to ``classification_metrics``.

    Returns:
        None. The report is written to standard output.
    """
    rule = "______________________________________________"
    # Labels are listed in the same order classification_metrics returns scores.
    labels = ("Accuracy: ", "Precision: ", "Recall: ", "F1-score: ", "AUC: ")

    print(rule)
    print("Model: " + classifierName)
    scores = classification_metrics(Y_pred, Y_true)
    for label, score in zip(labels, scores):
        print(label + str(score))
    print(rule)
    print("")


### XGBoost Regressor

In [53]:
# Load the housing dataset from a CSV file in the current working directory.
house = pd.read_csv('house.csv') 

# Bare trailing expression: the notebook renders the DataFrame as this cell's output.
house 


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,6.48,22.0


In [54]:
# Feature columns used as regressor inputs.
# Alternative subsets kept for reference:
#   - every column below except 'CRIM'
#   - only 'CRIM', 'ZN', 'INDUS', 'CHAS'
feature_columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
    'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT',
]
X = house[feature_columns]

# Regression target column.
y = house['MEDV']


In [55]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Regressor with library-default hyperparameters. fit() hands back the
# fitted estimator, so construction and training are chained.
model = XGBRegressor().fit(X_train, y_train)

# Predictions for the held-out rows, scored in the next cell.
y_pred = model.predict(X_test)


In [56]:
# The import cell uses `from sklearn.metrics import *`, which binds the scoring
# functions themselves but no module named `metrics`. Calling them directly
# keeps this cell working on a fresh kernel (Restart & Run All).
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))  # RMSE is the square root of the MSE computed above


MAE: 1.873616880529067
MSE: 6.918145463304386
RMSE: 2.6302367694381403


### XGBoost Classifier

In [57]:
# Load the advertising dataset from a CSV file in the current working directory.
ad_data = pd.read_csv('advertising.csv')

# Bare trailing expression: the notebook renders the DataFrame as this cell's output.
ad_data


Unnamed: 0,Daily Time Spent on Site,Age,Income,Daily Internet Usage,Ad Topic Line,Male,Country,Timestamp,Clicked on Ad
0,82.03,41,71511.08,187.53,Intuitive dynamic attitude,0,Afghanistan,5/2/2016 7:00,0
1,80.03,44,24030.06,150.84,Automated static concept,0,Afghanistan,7/23/2016 14:47,1
2,51.38,59,42362.49,158.56,Object-based modular functionalities,0,Afghanistan,6/17/2016 17:11,1
3,77.07,40,44559.43,261.02,Face-to-face analyzing encryption,0,Afghanistan,3/1/2016 10:01,0
4,51.87,50,51869.87,119.65,Team-oriented dynamic forecast,0,Afghanistan,3/10/2016 22:28,1
...,...,...,...,...,...,...,...,...,...
995,67.35,29,47510.42,118.69,Sharable secondary Graphical User Interface,0,Zimbabwe,4/28/2016 21:58,1
996,56.64,29,55984.89,123.24,Versatile 6thgeneration parallelism,1,Zimbabwe,2/27/2016 8:52,1
997,36.91,48,54645.20,159.69,Ameliorated coherent open architecture,0,Zimbabwe,2/24/2016 7:13,1
998,81.56,26,51363.16,213.70,Open-source scalable protocol,1,Zimbabwe,7/13/2016 21:31,0


In [58]:
# One-hot encode 'Country' (first level dropped) and use the encoded columns
# in place of the original text column. `ad_data` itself is not modified.
country_dummies = pd.get_dummies(ad_data['Country'], prefix='Country', drop_first=True)
ad_data2 = pd.concat([ad_data, country_dummies], axis=1).drop(columns=['Country'])

# Candidate predictor sets.
predictors1 = ['Age']

# 'Age' plus every encoded country column.
country_columns = [name for name in ad_data2.columns if name.startswith('Country')]
predictors2 = ['Age'] + country_columns

predictors3 = [
    'Age',
    'Male',
    'Daily Time Spent on Site',
    'Daily Internet Usage',
    'Income',
]


In [62]:
# Classifier inputs: the columns named in predictors3 (swap in predictors1 or
# predictors2 here to try the other candidate sets).
X = ad_data2[predictors3]

# Classification target column.
y = ad_data2['Clicked on Ad'] 


In [63]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Classifier with library-default hyperparameters. fit() hands back the
# fitted estimator, so construction and training are chained.
model = XGBClassifier().fit(X_train, y_train)

# Predictions for the held-out rows, scored in the next cell.
y_pred = model.predict(X_test)


In [64]:
# Confusion matrix for the held-out predictions.
confusion_matrix_results = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('confusion matrix: \n', confusion_matrix_results)

# display_metrics takes predictions before ground truth; keyword arguments
# make that ordering explicit at the call site.
display_metrics(classifierName='Xgboost', Y_pred=y_pred, Y_true=y_test)


confusion matrix: 
 [[97  4]
 [ 5 94]]
______________________________________________
Model: Xgboost
Accuracy: 0.955
Precision: 0.9591836734693877
Recall: 0.9494949494949495
F1-score: 0.9543147208121828
AUC: 0.9549454945494548
______________________________________________

