

```
# This is formatted as code
```

**Import training set and test set into dataframe**

In [23]:
import pandas
import numpy as np
train_transaction = pandas.read_csv('Dataset/train_transaction.csv')
train_identity = pandas.read_csv('Dataset/train_identity.csv')
test_transaction = pandas.read_csv('Dataset/test_transaction.csv')
test_identity = pandas.read_csv('Dataset/test_identity.csv')
train_set = train_transaction.merge(train_identity,on='TransactionID',how='left')
test_set = test_transaction.merge(test_identity,on='TransactionID',how='left')
train_set.append(test_set)
df = train_set
df.describe().transpose()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TransactionID,590540.0,3.282270e+06,1.704744e+05,2987000.000,3134634.750,3282269.500,3429904.25,3.577539e+06
isFraud,590540.0,3.499001e-02,1.837546e-01,0.000,0.000,0.000,0.00,1.000000e+00
TransactionDT,590540.0,7.372311e+06,4.617224e+06,86400.000,3027057.750,7306527.500,11246620.00,1.581113e+07
TransactionAmt,590540.0,1.350272e+02,2.391625e+02,0.251,43.321,68.769,125.00,3.193739e+04
card1,590540.0,9.898735e+03,4.901170e+03,1000.000,6019.000,9678.000,14184.00,1.839600e+04
card2,581607.0,3.625555e+02,1.577932e+02,100.000,214.000,361.000,512.00,6.000000e+02
card3,588975.0,1.531949e+02,1.133644e+01,100.000,150.000,150.000,150.00,2.310000e+02
card5,586281.0,1.992789e+02,4.124445e+01,100.000,166.000,226.000,226.00,2.370000e+02
addr1,524834.0,2.907338e+02,1.017411e+02,100.000,204.000,299.000,330.00,5.400000e+02
addr2,524834.0,8.680063e+01,2.690623e+00,10.000,87.000,87.000,87.00,1.020000e+02


**Function to reduce memoery**

In [2]:
def reduce_mem(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

**Applying memory reduction to our dataset**

In [3]:
reduce_mem(df)

Mem. usage decreased from 1959.88 Mb to 650.48 Mb (66.8% reduction)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.500000,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.000000,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.000000,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.000000,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.000000,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
5,2987005,0,86510,49.000000,W,5937,555.0,150.0,visa,226.0,...,,,,,,,,,,
6,2987006,0,86522,159.000000,W,12308,360.0,150.0,visa,166.0,...,,,,,,,,,,
7,2987007,0,86529,422.500000,W,12695,490.0,150.0,visa,226.0,...,,,,,,,,,,
8,2987008,0,86535,15.000000,H,2803,100.0,150.0,visa,226.0,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
9,2987009,0,86536,117.000000,W,17399,111.0,150.0,mastercard,224.0,...,,,,,,,,,,


In [4]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


**Replace all null values**

In [5]:
df.isnull().any().sum()

414

In [6]:
df = df.fillna(0)
df.isnull().any().sum()

0

**Transform categorical features to numerical**

In [7]:
categorical_data = df.select_dtypes(include=[object])

In [8]:
print(categorical_data.head())

  ProductCD       card4   card6 P_emaildomain R_emaildomain M1 M2 M3  M4 M5  \
0         W    discover  credit             0             0  T  T  T  M2  F   
1         W  mastercard  credit     gmail.com             0  0  0  0  M0  T   
2         W        visa   debit   outlook.com             0  T  T  T  M0  F   
3         W  mastercard   debit     yahoo.com             0  0  0  0  M0  T   
4         H  mastercard  credit     gmail.com             0  0  0  0   0  0   

   ...        id_30                id_31      id_33           id_34 id_35  \
0  ...            0                    0          0               0     0   
1  ...            0                    0          0               0     0   
2  ...            0                    0          0               0     0   
3  ...            0                    0          0               0     0   
4  ...  Android 7.0  samsung browser 6.2  2220x1080  match_status:2     T   

  id_36 id_37 id_38 DeviceType                     DeviceInfo 

In [9]:
df = df.drop(categorical_data.columns,axis = 1)

In [10]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
0,2987000,0,86400,68.5,13926,0.0,150.0,142.0,315.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,...,166.0,0.0,542.0,144.0,0.0,0.0,0.0,0.0,0.0,32.0


In [11]:
from sklearn import preprocessing
encoding = preprocessing.OneHotEncoder()
categorical_data.replace(0, "0", inplace=True)

le = preprocessing.LabelEncoder()
le_cat_data = categorical_data.apply(le.fit_transform)

le_cat_data.head()

Unnamed: 0,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,...,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,4,2,2,0,0,2,2,2,3,1,...,0,0,0,0,0,0,0,0,0,0
1,4,3,2,17,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
2,4,4,3,36,0,2,2,2,1,1,...,0,0,0,0,0,0,0,0,0,0
3,4,3,3,54,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
4,1,3,2,17,0,0,0,0,0,0,...,8,124,165,4,2,1,2,2,2,955


In [12]:
df = pandas.concat([df, le_cat_data], axis=1)

**Standardize some of our features**

In [13]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

df['amount'] = StandardScaler().fit_transform(df['TransactionAmt'].values.reshape(-1,1))
df['time'] = StandardScaler().fit_transform(df['TransactionDT'].values.reshape(-1,1))

df = df.drop(['TransactionDT','TransactionAmt'], axis = 1)
df.head()

  return umr_sum(a, axis, dtype, out, keepdims, initial)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return umr_sum(a, axis, dtype, out, keepdims, initial)
  X /= self.scale_


Unnamed: 0,TransactionID,isFraud,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,amount,time
0,2987000,0,13926,0.0,150.0,142.0,315.0,87.0,19.0,0.0,...,0,0,0,0,0,0,0,0,,-1.577987
1,2987001,0,2755,404.0,150.0,102.0,325.0,87.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,-1.577986
2,2987002,0,4663,490.0,150.0,166.0,330.0,87.0,287.0,0.0,...,0,0,0,0,0,0,0,0,,-1.577972
3,2987003,0,18132,567.0,150.0,117.0,476.0,87.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,-1.577965
4,2987004,0,4497,514.0,150.0,102.0,420.0,87.0,0.0,0.0,...,165,4,2,1,2,2,2,955,,-1.577964


In [20]:
X = df.drop(['isFraud'], axis = 1)
Y = df['isFraud']

**Applying PCA to get all features**

In [21]:
pca = PCA(n_components=432)
pComponents = pca.fit_transform(X.values)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [0]:
eigenvalues = pca.explained_variance_
total_sum = eigenvalues.sum()

Calculating the top 50 PCA to understand the transformed features

In [0]:
for i in range(1, 50):
  total = eigenvalues[:i].sum()
  variance = total/total_sum
  print('total variance for {} PCA is : {}'.format(i, variance))

total variance for 1 PCA is : 0.6629360723761234
total variance for 2 PCA is : 0.9264209258326431
total variance for 3 PCA is : 0.9941266719977593
total variance for 4 PCA is : 0.9989472836005472
total variance for 5 PCA is : 0.9994913388371168
total variance for 6 PCA is : 0.9996313547957322
total variance for 7 PCA is : 0.9997203894034415
total variance for 8 PCA is : 0.9997873776749229
total variance for 9 PCA is : 0.9998431481518449
total variance for 10 PCA is : 0.9998898766453447
total variance for 11 PCA is : 0.9999155475881404
total variance for 12 PCA is : 0.9999290058566933
total variance for 13 PCA is : 0.9999395991924115
total variance for 14 PCA is : 0.9999474499751945
total variance for 15 PCA is : 0.9999540350811208
total variance for 16 PCA is : 0.9999599556503607
total variance for 17 PCA is : 0.9999655278624116
total variance for 18 PCA is : 0.9999698340001026
total variance for 19 PCA is : 0.9999733693783566
total variance for 20 PCA is : 0.9999759972513599
total var

The top 2 elements represent for 99% of data. 

In order to test our hypothesis, applied 30 pca to our dataset.

In [0]:
pca = PCA(n_components=30)
pComponents = pca.fit_transform(X.values)
pDf = pandas.DataFrame(data = pComponents
             , columns = ['pc_1', 'pc_2', 'pc_3', 'pc_4', 'pc_5','pc_6', 'pc_7','pc_8', 'pc_9', 'pc_10', 'pc_11', 'pc_12', 'pc_13', 'pc_14', 'pc_15', 'pc_16', 'pc_17', 'pc_18', 'pc_19', 'pc_20', 'pc_21', 'pc_22', 'pc_23', 'pc_24', 'pc_25', 'pc_26', 'pc_27', 'pc_28', 'pc_29', 'pca_30'])
finalDf = pandas.concat([pDf, Y], axis = 1)
finalDf.head()

Unnamed: 0,TransactionID,card1,card2,card3,card5,addr1,addr2,dist1,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,V1,V2,...,id_21,id_22,id_24,id_25,id_26,id_32,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,M6,M7,M8,M9,id_12,id_15,id_16,id_23,id_27,id_28,id_29,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,amount,time,isFraud
0,2987000,13926,0.0,150.0,142.0,315.0,87.0,19.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4,2,2,0,0,2,2,2,3,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577987,0
1,2987001,2755,404.0,150.0,102.0,325.0,87.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4,3,2,17,0,0,0,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577986,0
2,2987002,4663,490.0,150.0,166.0,330.0,87.0,287.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,315.0,0.0,0.0,0.0,315.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4,4,3,36,0,2,2,2,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577972,0
3,2987003,18132,567.0,150.0,117.0,476.0,87.0,0.0,0.0,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,0.0,0.0,0.0,0.0,84.0,0.0,0.0,0.0,0.0,111.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4,3,3,54,0,0,0,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0,-1.577965,0
4,2987004,4497,514.0,150.0,102.0,420.0,87.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,32.0,1,3,2,17,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,2,2,8,124,165,4,2,1,2,2,2,955,-0.0,-1.577964,0


**Visualizing the distribution of top 10 pca**

In [0]:
import matplotlib.gridspec as gridspec
from matplotlib.pyplot import figure
from matplotlib import pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D 
plt.figure(figsize=(12,28*4))
plt.style.use('ggplot')
final_features = finalDf.iloc[:,0:10].columns
grid = gridspec.GridSpec(10, 1)
for i, cn in enumerate(final_features):
    axis = plt.subplot(grid[i])
    sns.distplot(finalDf[cn][finalDf.isFraud == 1], bins=50)
    sns.distplot(finalDf[cn][finalDf.isFraud == 0], bins=50)
    axis.set_xlabel('histogram of feature: ' + str(cn))
plt.show()

In [18]:
#Resampling
finalDf.shape
finalDf = pandas.concat([X, Y], axis = 1)
finalDf.head()

NameError: name 'finalDf' is not defined

**Splitting the data into test set and train set**

In [1]:
finalDf.describe().transpose()

NameError: ignored

In [0]:
finalDf_copy = finalDf.copy()
#Spliting the data set into train and test set.

train_set = finalDf_copy.sample(frac=0.80, random_state=0)
test_set = finalDf_copy.drop(train_set.index)
train_set.shape

In [0]:
#Class Count after resampling
class_count_resample=train_set['isFraud'].value_counts().values
print(class_count_resample)
sns.barplot(['Genuine','Fraud'],class_count_resample)

In [0]:
del finalDf_copy
del finalDf

In [0]:
#Now we have traing set lets count how many samples of each class do we have
df_isFraud = train_set[train_set.isFraud == 1]
df_notFraud = train_set[train_set.isFraud == 0]

In [0]:
df_isFraud.shape


In [0]:
df_notFraud.shape

**Down Sampling**

In [22]:
from sklearn.utils import resample
df_downsampled = resample(df_notFraud,
                                      replace=False,
                                      n_samples=df_isFraud.shape[0])    # to match minority class
print("After Down Sampling:\n",df_downsampled.shape,df_isFraud.shape)

NameError: name 'df_notFraud' is not defined

**Upsampling**

In [0]:
from sklearn.utils import resample
df_upsampled = resample(df_isFraud, replace=True, n_samples=df_notFraud.shape[0], random_state=12 )    # to match majority class
print("After Up Sampling:\n",df_upsampled.shape,df_isFraud.shape)

In [0]:
from sklearn.utils import shuffle
df_train = shuffle(pandas.concat([df_downsampled,df_isFraud]))

In [0]:
df_train_up = shuffle(pandas.concat([df_upsampled,df_notFraud]))

In [0]:
#Class Count after resampling
class_count_resample=df_train['isFraud'].value_counts().values
print(class_count_resample)
sns.barplot(['Genuine','Fraud'],class_count_resample)

In [0]:
#Class Count after resampling
class_count_resample=df_train_up['isFraud'].value_counts().values
print(class_count_resample)
sns.barplot(['Genuine','Fraud'],class_count_resample)

In [0]:
df_train

In [0]:
#Now we have the balanced traing data set we can 
#Divide the data set into X and y
y_train = df_train.isFraud
X_train = df_train.drop('isFraud',axis=1)

y_test = test_set.isFraud
X_test = test_set.drop('isFraud',axis=1)


**Using Logistic Regression**
traning our Model

In [0]:

from sklearn.linear_model import LogisticRegression as logReg
model_LR = logReg().fit(X_train,y_train)

In [0]:
prediction_LR = model_LR.predict(X_test)

from sklearn.metrics import accuracy_score

#Testing the model for accuracy on test data set
print(accuracy_score(y_test,prediction_LR))

In [0]:
from sklearn import metrics
confusion_matrix_LR = metrics.confusion_matrix(y_test, prediction_LR)
confusion_matrix_LR

In [0]:
print("Accuracy:",metrics.accuracy_score(y_test, prediction_LR))
print("Precision:",metrics.precision_score(y_test, prediction_LR))
print("Recall:",metrics.recall_score(y_test, prediction_LR))

**Using Naive Bayes**
traning our Model

In [0]:

from sklearn.naive_bayes import GaussianNB
model_NB = GaussianNB().fit(X_train,y_train)
prediction_NB = model_NB.predict(X_test)
print(accuracy_score(y_test,prediction_NB))

**Random Forest**

In [0]:
from sklearn.ensemble import RandomForestRegressor

In [0]:
model_RF = RandomForestRegressor(
    n_estimators=400, max_features=0.3,
    min_samples_leaf=20, n_jobs=-1, verbose=1)

In [0]:
model_RF.fit(X_train,y_train)

In [0]:
prediction_RF = model_RF.predict(X_test)

In [0]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, prediction_RF)

**SVM**

In [0]:
from sklearn.svm import LinearSVC
model_svm = LinearSVC()
model_svm.fit(X_train, y_train)

In [0]:
predict_SVM = model_svm.predict(X_test)

**KMeans**

In [0]:
from sklearn.cluster import KMeans

# Number of clusters
kmeans = KMeans(n_clusters=2)
# Fitting the input data
kmeans = kmeans.fit(X_train)
# Getting the cluster labels
predict_kmeans = kmeans.predict(X_test)
# Centroid values
centroids = kmeans.cluster_centers_

print(centroids) # From sci-kit learn

In [0]:
print(accuracy_score(y_test,predict_kmeans ))

In [24]:
import numpy as np
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
lw = 2
# logistic regression
FPR_LR,TPR_LR,th_LR = metrics.roc_curve(y_test,prediction_LR)
area_LR = roc_auc_score(y_test,prediction_LR)
# Naives Byaes
FPR_NB,TPR_NB,th_NB = metrics.roc_curve(y_test,prediction_NB)
area_NB = roc_auc_score(y_test,prediction_NB)
# Randon Forest
FPR_RF,TPR_RF,th_RF = metrics.roc_curve(y_test,prediction_RF)
area_RF = roc_auc_score(y_test,prediction_RF)
# SVM
FPR_SVM,TPR_SVM,th_SVM = metrics.roc_curve(y_test,predict_SVM)
area_SVM = roc_auc_score(y_test,predict_SVM)
# Kmeans
FPR_kmeans,TPR_kmeans,th_kmeans = metrics.roc_curve(y_test,predict_kmeans)
area_kmeans = roc_auc_score(y_test,predict_kmeans)


plt.plot(FPR_RF,TPR_RF,lw=lw, label='Randon Forest (area = %0.2f)' % area_RF)
plt.plot(FPR_LR,TPR_LR,lw=lw, label='Logistic Regression (area = %0.2f)' % area_LR)
plt.plot(FPR_NB,TPR_NB,lw=lw, label='Naives Bayes (area = %0.2f)' % area_NB)
plt.plot(FPR_SVM,TPR_SVM,lw=lw, label='SVM (area = %0.2f)' % area_SVM)
plt.plot(FPR_kmeans,TPR_kmeans,lw=lw, label='K-means (area = %0.2f)' % area_kmeans)





# plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC plot for different Algorithms')
plt.legend(loc="lower right")
plt.show()



NameError: name 'y_test' is not defined