In [None]:
conda install graphviz

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import datasets,tree
from sklearn.tree import export_graphviz 
from sklearn import externals
from io import StringIO
import pydotplus
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.image as mpimg 
import matplotlib.pyplot as plt

## __1 - Business Problem__  
___Use Random Forest to prepare a model on fraud data___  

## __2 - Data collection and description__ 

In [None]:
# Load the raw fraud dataset from the path relative to this notebook.
df = pd.read_csv(filepath_or_buffer="../DATASCIENCE/Fraud_check.csv")

In [None]:
# Work on an independent copy so the raw frame `df` stays untouched.
df1 = df.copy(deep=True)

In [None]:
# Preview the first five rows.
df1.head(n=5)

In [None]:
# Summary statistics, transposed so each described column is a row.
df1.describe().transpose()

In [None]:
# Count missing values per column.
df1.isna().sum()

In [None]:
# Show the dtype of every column in df1.
df1.dtypes

### Outlier Check

In [None]:
# Pass the series as `x=` explicitly: a bare positional argument is read as
# `x` by older seaborn releases but as `data` by newer ones, so the box
# orientation would otherwise depend on the installed version.
ax = sns.boxplot(x=df1['Taxable.Income'])

### There are no outliers in the data

In [None]:
# Default figure size (width, height in inches) for the plots that follow.
plt.rcParams["figure.figsize"] = (9, 5)

In [None]:
# Distribution of the (still continuous) taxable income: print its skewness
# and kurtosis, then draw a filled kernel-density estimate.
income = df1['Taxable.Income']
print("Skew: {}".format(income.skew()))
print("Kurtosis: {}".format(income.kurtosis()))

plt.figure(figsize=(16, 5))
# `fill=True` is the supported spelling of the deprecated `shade=True`;
# `x=` keeps the call unambiguous across seaborn versions.
ax = sns.kdeplot(x=income, fill=True, color='g')
plt.xticks(list(range(10000, 100000, 10000)))
plt.show()

### The data is skewed to the right  
### The data has negative Kurtosis

In [None]:
# Names of the object-dtype (categorical) columns.
obj_colum = list(df1.select_dtypes(include='object').columns)

In [None]:
# One row per categorical column: raw counts on the left, normalised
# frequencies on the right.
# The grid is sized from the number of columns and each column gets its own
# pair of panels (2*i+1, 2*i+2); indexing panels as i and i+1 would make
# consecutive columns draw into the same panel.
n_obj = len(obj_colum)
plt.figure(figsize=(16, 10))
for i, col in enumerate(obj_colum):
    plt.subplot(n_obj, 2, 2 * i + 1)
    sns.countplot(data=df1, y=col)

    plt.subplot(n_obj, 2, 2 * i + 2)
    df1[col].value_counts(normalize=True).plot.bar()
    # Categories are on the x axis of a vertical bar chart, shares on the y axis.
    plt.xlabel(col)
    plt.ylabel('% distribution per category')
plt.tight_layout()
plt.show()

In [None]:
# Names of every column that is not object-dtype.
num_columns = list(df1.select_dtypes(exclude='object').columns)

In [None]:
# For every numeric column draw a filled KDE (top row) and a box plot
# (bottom row), then tabulate skewness and kurtosis.
# The grid is sized from `num_columns` instead of a fixed 8x4 layout, and the
# data is read from `df1` — the frame `num_columns` was derived from.
n_panels = max(len(num_columns), 1)  # keep the grid valid even with no columns
fig, axes = plt.subplots(2, n_panels, figsize=(6 * n_panels, 8), squeeze=False)
for j, col in enumerate(num_columns):
    # `fill=True` is the supported spelling of the deprecated `shade=True`.
    sns.kdeplot(x=df1[col], color='g', fill=True, ax=axes[0, j])
    df1[col].plot.box(ax=axes[1, j])
fig.tight_layout()
plt.show()

num_data = df1[num_columns]
pd.DataFrame(data=[num_data.skew(), num_data.kurtosis()], index=['skewness', 'kurtosis'])

In [None]:
# One-hot encode the three categorical columns; df1 is rebound to the result.
categorical_cols = ['Undergrad', 'Marital.Status', 'Urban']
df1 = pd.get_dummies(data=df1, columns=categorical_cols)

In [None]:
# Pairwise correlation matrix of the encoded frame (computed once; the
# identical second cell was redundant).
corr = df1.corr()

In [None]:
# Annotated heatmap of the correlation matrix on a square canvas.
plt.figure(figsize=(10, 10))
sns.heatmap(data=corr, annot=True)

## __3 - Random Forest Model__ 

### Since the target variable is continuous, we bin it into two classes: taxable income <= 30000 is "Risky" and everything else is "Good"   

In [None]:
# Replace the continuous income with a two-level category:
# (0, 30000] -> 'risky', (30000, 100000] -> 'good'.
income_bins = [0, 30000, 100000]
income_labels = ['risky', 'good']
df1['Taxable.Income'] = pd.cut(df1['Taxable.Income'], bins=income_bins, labels=income_labels)

In [None]:
# Column names of the encoded frame, in positional order.
df1.columns.tolist()

In [None]:
# Target is the first column; the predictors are the columns at positions 1-9.
y = df1.iloc[:, 0]
X = df1.iloc[:, 1:10]

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the split — and
# every metric computed from it — is identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Number of training rows per target class.
y_train.value_counts()

In [None]:
# Random forest of 150 entropy-split trees, with out-of-bag scoring enabled.
# `random_state` pins the bootstrap samples and feature draws so the fitted
# forest (and model.estimators_[k]) is the same on every run.
model = RF(n_jobs=4, n_estimators=150, oob_score=True, criterion='entropy', random_state=42)
model.fit(x_train, y_train)
# Out-of-bag accuracy estimate, shown as the cell output.
model.oob_score_

In [None]:
# Predictions on the rows the forest was fitted on.
pred_train = model.predict(X=x_train)

In [None]:
# Training-set accuracy.
accuracy_score(y_true=y_train, y_pred=pred_train)

In [None]:
# Training-set confusion matrix (rows: actual, columns: predicted).
confusion_matrix(y_true=y_train, y_pred=pred_train)

In [None]:
# Predictions on the held-out rows.
pred_test = model.predict(X=x_test)

In [None]:
# Test-set accuracy.
accuracy_score(y_true=y_test, y_pred=pred_test)

In [None]:
# Test-set confusion matrix (rows: actual, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=pred_test)

In [None]:
# Side-by-side table of actual vs. predicted labels for the test rows.
df_t = pd.DataFrame(data={'Actual': y_test, 'Predicted': pred_test})

In [None]:
# Display the actual-vs-predicted table.
df_t

In [None]:
# Column names of df1 as a plain list, in positional order.
cols = df1.columns.tolist()

In [None]:
# Same positional split as X / y: first name is the target, names 1-9 are the predictors.
target, predictors = cols[0], cols[1:10]

In [None]:
# Take the tree at index 20 of the fitted forest's estimators.
tree1 = model.estimators_[20]

In [None]:
# In-memory text buffer that export_graphviz writes the DOT source into.
dot_data = StringIO()

In [None]:
# Write the DOT description of the selected tree into `dot_data`.
# `class_names` must be a sequence of class labels in ascending class order;
# the forest's `classes_` provides exactly that. Passing the target column
# name (a single string) would be indexed character by character or rejected.
export_graphviz(
    tree1,
    out_file=dot_data,
    feature_names=predictors,
    class_names=[str(label) for label in model.classes_],
    filled=True,
    rounded=True,
    impurity=False,
    proportion=False,
    precision=2,
)

In [None]:
# Parse the DOT text accumulated in the buffer into a pydotplus graph.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

In [None]:
# Render the graph to a PNG file in the working directory.
graph.write_png('fraud_full.png')

## __4 - Conclusion__ 

### Since the accuracy on the training set is 100%, we check the accuracy on the test data, which is 72%  
### As seen in the confusion matrix of the test data, 94 instances are predicted correctly and 26 are not

In [None]:
# Small, shallow forest (10 trees, depth <= 3) used only to get a readable
# tree picture. Seeded so estimators_[k] is the same tree on every run.
rf_small = RF(n_estimators=10, max_depth=3, random_state=42)

In [None]:
# Fit the shallow forest on the same training split.
rf_small.fit(X=x_train, y=y_train)

In [None]:
# Take the tree at index 5 of the shallow forest's estimators.
tree_small = rf_small.estimators_[5]

In [None]:
# Start from an empty buffer: the previous buffer already holds the DOT text
# of the full tree, and appending a second graph to it would hand pydotplus
# two concatenated graphs.
dot_data = StringIO()
export_graphviz(tree_small, out_file=dot_data, feature_names=predictors, rounded=True, precision=1)

In [None]:
# Parse the current contents of the DOT buffer into a pydotplus graph.
graph_small = pydotplus.graph_from_dot_data(dot_data.getvalue())

In [None]:
# Write the small tree's graph; `graph` is the full tree and would put the
# wrong picture into fraud_small.png.
graph_small.write_png('fraud_small.png')

In [None]:
# Read the rendered PNG back as an image array.
img = mpimg.imread(fname='fraud_small.png')

In [None]:
# Display the loaded image inline.
plt.imshow(img)

In [None]:
# Feature importances of the 150-tree forest, in the column order of x_train.
model.feature_importances_

In [None]:
# Pair each training column with its importance and rank from most to least important.
fi = (
    pd.DataFrame({'feature': x_train.columns.tolist(),
                  'importance': model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Display the ranked feature-importance table.
fi

### As seen in the table above, city population is the most important feature