# Support Vector Machine Model

## Dataset
This dataset is used in the Kaggle InClass Competition "Telecom Churn Analytics".

The dataset used in this notebook can be downloaded from the link below.

https://www.kaggle.com/c/churn-analytics-bda

###### Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc
import warnings
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import cross_validation,metrics
from sklearn.metrics import classification_report, confusion_matrix,roc_curve,auc
from IPython.display import Image
from IPython.core.display import HTML 

warnings.simplefilter("ignore")
%matplotlib inline

In [None]:
# Load the two competition CSV files: `train` from churnTrain.csv and `test`
# from churnTest.csv (paths are relative to the notebook's working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/churnTrain.csv', 'Datasets/churnTest.csv')
)

In [None]:
# Add a Churn column to the test dataset so that both datasets can be merged;
# the constant 'TESTE' marks every row that came from the test file.
test = test.assign(Churn='TESTE')

# Stack the two frames row-wise into one frame for shared preprocessing.
dataframes = [train, test]
churnData = pd.concat(dataframes, axis=0)

#### Feature Engineering

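N/A treatment: missing values in the numeric columns are replaced with the column mean, and missing values in the two plan columns are filled with ' no'.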

In [None]:
# Fill missing values.
#
# Each result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on a column selection. That form is chained
# assignment: under pandas Copy-on-Write it only modifies a temporary object
# and leaves `churnData` unchanged, and the accompanying warning is hidden by
# the notebook's `warnings.simplefilter("ignore")`.

# Plan flags: a missing value is treated as ' no' (the leading space is kept
# as in the original fill value).
for plan_col in ('Voice_Mail_Plan', 'International_Plan'):
    churnData[plan_col] = churnData[plan_col].fillna(' no')

# Numeric usage columns: a missing value is replaced by the column mean.
# The original listed 'Total_Night_Minutes' twice; it is listed once here.
mean_filled_cols = [
    'Account_Length',
    'Total_Night_Minutes',
    'Total_Intl_Calls',
    'Total_Night_Charge',
    'Total_Night_Calls',
    'Total_Eve_Charge',
    'Total_Eve_Calls',
    'Total_Eve_Minutes',
    'Total_Day_charge',
    'Total_Day_Calls',
    'Total_Day_minutes',
]
for numeric_col in mean_filled_cols:
    churnData[numeric_col] = churnData[numeric_col].fillna(churnData[numeric_col].mean())

In [None]:
# Cast the label and the nominal columns to pandas' categorical dtype.
for nominal_col in ('Churn', 'State', 'Voice_Mail_Plan', 'International_Plan'):
    churnData[nominal_col] = churnData[nominal_col].astype('category')

# Replace the label with its integer category code, and store the state's
# code in a new `State_cat` column (the categorical `State` column is kept).
# NOTE(review): the later `Churn != 2` split assumes the 'TESTE' placeholder
# receives code 2 - confirm against the actual label values.
category_codes = {'Churn': 'Churn', 'State': 'State_cat'}
for source_col, target_col in category_codes.items():
    churnData[target_col] = churnData[source_col].cat.codes

##### One Hot Encoding - OHE

In [None]:
# One-hot encode the two plan columns: each is replaced by indicator columns
# named "<column>_<value>"; all other columns pass through unchanged.
cols_to_transform = [
    'International_Plan',
    'Voice_Mail_Plan',
]
churnData = pd.get_dummies(data=churnData, columns=cols_to_transform)

##### Train and test Split

In [None]:
## Train and test separated again
# Keep only the modelling columns, with the label last.
feature_cols = [
    'Account_Length', 'Area_Code', 'Phone_No', 'No_Vmail_Messages',
    'Total_Day_minutes', 'Total_Day_Calls', 'Total_Day_charge',
    'Total_Night_Calls', 'Total_Night_Charge',
    'Total_Intl_Minutes', 'Total_Intl_Calls', 'Total_Intl_Charge',
    'No_CS_Calls', 'State_cat',
    'Voice_Mail_Plan_ no', 'Voice_Mail_Plan_ yes',
    'International_Plan_ no', 'International_Plan_ yes',
]
churnData = churnData[feature_cols + ['Churn']]

# Rows whose label code is 2 are the unlabelled competition rows (the
# placeholder label added before concatenation); everything else is training data.
is_unlabelled = churnData['Churn'] == 2
churnTrain = churnData[~is_unlabelled]
# DataFrame.drop returns a new frame. The original discarded that result, so
# the placeholder label column was never actually removed from churnTest.
churnTest = churnData[is_unlabelled].drop(columns='Churn')

# Use the existing 0/1 label codes directly. pd.factorize numbers values by
# order of first appearance, which can silently swap the two classes, and the
# former reshape(-1, 1) produced a column vector where scikit-learn expects a
# 1-D target.
Y = churnTrain['Churn'].to_numpy()
X = churnTrain.drop(columns='Churn')

# Hold out 33% of the labelled rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [None]:
# Relabel the columns of churnData positionally: same 19 names as selected
# above, except that the four one-hot columns lose the space after the final
# underscore.
# NOTE(review): X / X_train were built before this cell, so they presumably
# still carry the original (spaced) column names.
clean_column_names = [
    'Account_Length', 'Area_Code', 'Phone_No', 'No_Vmail_Messages',
    'Total_Day_minutes', 'Total_Day_Calls', 'Total_Day_charge',
    'Total_Night_Calls', 'Total_Night_Charge',
    'Total_Intl_Minutes', 'Total_Intl_Calls', 'Total_Intl_Charge',
    'No_CS_Calls', 'State_cat',
    'Voice_Mail_Plan_no', 'Voice_Mail_Plan_yes',
    'International_Plan_no', 'International_Plan_yes',
    'Churn',
]
churnData = churnData.set_axis(clean_column_names, axis=1)

In [None]:
# Support vector classifier created with no arguments, i.e. with the
# library's default hyperparameters.
# NOTE(review): the features are fed in unscaled; SVMs are sensitive to
# feature scale, so consider standardising the inputs - confirm the effect on the score.
model_svm = SVC()

In [None]:
# Train the classifier on the training split. The target is flattened to 1-D
# because scikit-learn estimators expect y of shape (n_samples,); a column
# vector only works through an implicit conversion that raises a warning
# (hidden here by the notebook's warning filter).
model_svm.fit(X_train, np.ravel(y_train))

In [None]:
# Evaluate the fitted classifier on the held-out split; the returned score is
# shown as the cell output.
model_svm.score(X=X_test, y=y_test)

In [None]:
# Predicted class label for every row of the held-out split.
y_pred = model_svm.predict(X=X_test)

##### Confusion Matrix 

In [None]:
# Counts of true versus predicted labels on the held-out split.
held_out_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(held_out_confusion)

# Per-class text report for the same predictions.
held_out_report = classification_report(y_true=y_test, y_pred=y_pred)
print(held_out_report)

##### ROC Curve

In [None]:
# roc_curve takes (y_true, y_score). The original call passed the predictions
# first and the ground truth second, so the curve and AUC were computed for
# the wrong roles. It also used hard 0/1 predictions, which reduce the curve
# to a single operating point; the continuous decision-function scores give
# the full curve.
y_score = model_svm.decision_function(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(np.ravel(y_test), y_score)

# Area under the ROC curve, used as the legend label in the plot below.
roc_auc = auc(false_positive_rate, true_positive_rate)

In [None]:
# Draw the ROC curve through an explicit Axes object.
roc_fig, roc_ax = plt.subplots()
roc_ax.set_title('Roc Curve SVM Model')

# Model curve in blue, labelled with the AUC rounded to two decimals.
auc_label = 'AUC = %0.2f' % roc_auc
roc_ax.plot(false_positive_rate, true_positive_rate, 'b', label=auc_label)
roc_ax.legend(loc='lower right')

# Dashed red diagonal as a visual reference line.
roc_ax.plot([0, 1], [0, 1], 'r--')

roc_ax.set_ylabel('True positive rate')
roc_ax.set_xlabel('False positive rate')
plt.show()

##### Feature Importance in this model

In [None]:
# SVC does not expose `feature_importances_`, so the original cell raised
# AttributeError. Permutation importance works with any fitted estimator: it
# shuffles one feature at a time and records how much the held-out score drops.
# Requires scikit-learn >= 0.22.
from sklearn.inspection import permutation_importance

perm_result = permutation_importance(
    model_svm, X_test, np.ravel(y_test), n_repeats=10, random_state=42
)

# One row per feature, sorted from most to least important, shown as a bar chart.
importances = pd.DataFrame({
    'feature': X.columns,
    'importance': np.round(perm_result.importances_mean, 3),
})
importances = importances.sort_values('importance', ascending=False).set_index('feature')
importances.plot.bar()

In [None]:
# SVC does not expose `feature_importances_`, so the original cell raised
# AttributeError. Use permutation importance on the held-out split instead:
# the mean drop in score when a single feature's values are shuffled.
# Requires scikit-learn >= 0.22.
from sklearn.inspection import permutation_importance

top_perm_result = permutation_importance(
    model_svm, X_test, np.ravel(y_test), n_repeats=10, random_state=42
)

# Horizontal bar chart of the ten features with the largest importance.
feat_importances = pd.Series(top_perm_result.importances_mean, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')