# Team 10
# Luis Yanchapanta, Andrew Chen, Andres Analuisa
## Project: Credit Card Fraud Detection
### The objective of the project is to gain insights from fraudulent credit card transactions. This project will be an attempt to identify trends within fraudulent transactions to better our understanding and answer questions such as where fraudulent transactions take place and who is more likely to be affected by fraud. Through this, we hope to find ways to improve existing preventative measures against fraud and to find new ones.
Data file: breast_cancer_diagnosis.csv

### Import libraries

In [157]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.impute import SimpleImputer

### Load data

In [186]:
# Load breast_cancer_diagnosis.csv into a dataframe, using the file's column at position 1 as the row index
df = pd.read_csv(filepath_or_buffer='breast_cancer_diagnosis.csv', index_col=1)


# Part 1: Web Scraping
## NYC Crime Data from "www.data.ny.gov"

### Examine data

In [187]:
# Dataframe dimensions as a (row count, column count) tuple
(len(df.index), len(df.columns))

(569, 12)

In [188]:
# Preview the first 10 rows of the raw dataframe
df.iloc[:10]

Unnamed: 0_level_0,id,radius,texture,perimeter,area,smoothness,compactness,concavity,symmetry,fractal_dimension,age,diagnosis
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Glynnis Munson,ID842302,,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.2419,0.07871,35,1
Lana Behrer,ID842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.1812,0.05667,27,1
Devondra Vanvalkenburgh,ID84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.2069,0.05999,31,1
Glory Maravalle,ID84348301,,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.2597,0.09744,49,1
Mellie Mccurdy,ID84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1809,0.05883,20,1
Merle Yelon,ID843786,,15.7,82.57,477.1,0.1278,0.17,0.1578,0.2087,0.07613,39,1
Corrianne Banzett,ID844359,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.1794,0.05742,38,1
Noni Marcellino,ID84458202,,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.2196,0.07451,28,1
Kacy Meltzer,ID844981,,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.235,0.07389,50,1
Elka Ortolani,ID84501001,,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.203,0.08243,60,1


Note: the radius column contains NaN (missing) values.

### Prepare data

### Check for missing values

In [189]:
# Count the missing values in each column of the raw dataframe
df.isna().sum()

id                    0
radius               71
texture               0
perimeter             0
area                  0
smoothness            0
compactness           0
concavity             0
symmetry              0
fractal_dimension     0
age                   0
diagnosis             0
dtype: int64

### Handle missing values
Remember to use "inplace=True"

In [191]:
# Figure out strategy to replace values where radius is NaN.
# Show a sample of the rows that contain at least one missing value.
# display() is needed because a bare expression is only rendered when it is
# the last statement of a cell; without it this inspection is discarded.
display(df[df.isnull().any(axis=1)].head(10))

# Strategy: replace each missing value with the mean of its column.
# NOTE(review): the imputer is fit on every row, before the train/test split,
# and X still includes the target column `diagnosis`. Consider fitting on the
# training features only so no test-set statistics leak into preprocessing.
X = df[[
    'radius', 'texture', 'perimeter', 'area', 'smoothness', 'compactness',
    'concavity', 'symmetry', 'fractal_dimension', 'age', 'diagnosis',
]]
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X)

SimpleImputer()

In [193]:
# Apply the fitted mean imputer to X and wrap the resulting array in a new
# dataframe that carries the same column labels as X
df2 = pd.DataFrame(data=imp_mean.transform(X), columns=list(X.columns))
df2.head(10)

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,concavity,symmetry,fractal_dimension,age,diagnosis
0,14.326635,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.2419,0.07871,35.0,1.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.1812,0.05667,27.0,1.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.2069,0.05999,31.0,1.0
3,14.326635,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.2597,0.09744,49.0,1.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1809,0.05883,20.0,1.0
5,14.326635,15.7,82.57,477.1,0.1278,0.17,0.1578,0.2087,0.07613,39.0,1.0
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.1794,0.05742,38.0,1.0
7,14.326635,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.2196,0.07451,28.0,1.0
8,14.326635,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.235,0.07389,50.0,1.0
9,14.326635,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.203,0.08243,60.0,1.0


### Check for missing values again

In [194]:
# Confirm that no missing values remain after imputation
df2.isna().sum()

radius               0
texture              0
perimeter            0
area                 0
smoothness           0
compactness          0
concavity            0
symmetry             0
fractal_dimension    0
age                  0
diagnosis            0
dtype: int64

### Drop non-numeric variables
Remember to use "inplace=True"

In [197]:
# Collect the columns whose dtype is not numeric.
non_numeric_cols = df2.select_dtypes(exclude='number')
# Drop them by label: drop() expects column labels, so pass `.columns`
# rather than the sub-dataframe itself. inplace=True is kept as requested
# by the instructions for this section.
df2.drop(columns=non_numeric_cols.columns, inplace=True)
df2.shape

(569, 11)

### Review updated dataframe

In [198]:
# Preview the first 10 rows of the cleaned dataframe
df2.iloc[:10]

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,concavity,symmetry,fractal_dimension,age,diagnosis
0,14.326635,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.2419,0.07871,35.0,1.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.1812,0.05667,27.0,1.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.2069,0.05999,31.0,1.0
3,14.326635,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.2597,0.09744,49.0,1.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1809,0.05883,20.0,1.0
5,14.326635,15.7,82.57,477.1,0.1278,0.17,0.1578,0.2087,0.07613,39.0,1.0
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.1794,0.05742,38.0,1.0
7,14.326635,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.2196,0.07451,28.0,1.0
8,14.326635,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.235,0.07389,50.0,1.0
9,14.326635,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.203,0.08243,60.0,1.0


### Separate independent and dependent variables
* Independent variables: All remaining variables except Diagnosis
* Dependent variable: Diagnosis

In [199]:
# List the distinct values of the dependent variable before modelling
df2.loc[:, 'diagnosis'].unique()

array([1., 0.])

### Split data into training and test sets

In [200]:
# Separate independent variables (every column except the target) from the
# dependent variable `diagnosis`. Selecting by label instead of by position
# keeps this correct even if the column order of df2 changes.
X = df2.drop(columns='diagnosis')
y = df2['diagnosis']

# Hold out 20% of the rows as the test set; random_state is fixed so the
# split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train model

In [201]:
# Fit a logistic regression with default hyperparameters on the training set;
# fit() returns the estimator itself, which is then shown as the cell output
model = LogisticRegression().fit(x_train, y_train)
model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### If above results in error, review error message, look up documentation for LogisticRegression, and change model hyperparameter appropriately

In [202]:
# Refit with a higher iteration cap (max_iter=200) for the solver
model = LogisticRegression(max_iter=200)
model.fit(X=x_train, y=y_train)

LogisticRegression(max_iter=200)

### Test model

In [203]:
# Predict the class label for every row of the held-out test set
predictions = model.predict(X=x_test)

# Show the raw array of predicted labels
print(predictions)

[0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1.
 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0.
 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1.]


### Model evaluation

In [204]:
# Per-class precision / recall / f1 of the predictions against the true test labels
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         0.0       0.96      0.93      0.94        71
         1.0       0.89      0.93      0.91        43

    accuracy                           0.93       114
   macro avg       0.92      0.93      0.93       114
weighted avg       0.93      0.93      0.93       114



In [205]:
# Mean accuracy of the model on the test set, reported as a percentage
accuracy = model.score(X=x_test, y=y_test)
print("accuracy = ", 100 * accuracy, "%")

accuracy =  92.98245614035088 %
