In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, recall_score
import seaborn as sns

In [None]:
# Load the dataset into `df`.
# NOTE(review): the path is an empty string, presumably a placeholder — fill in the CSV location.
df = pd.read_csv('')

### Cleaning

In [None]:
# Count missing (NaN) values per column before any cleaning.
df.isna().sum()

In [None]:
# Dimensions of the frame as (n_rows, n_columns).
df.shape

In [None]:
# Drop every column in which more than ~10% of the values are NaN.
# The NaN share is computed per column, so no column names are hard-coded
# (the previous `columns=['']` placeholder raised a KeyError).
NAN_FRACTION_THRESHOLD = 0.10
sparse_columns = df.columns[df.isna().mean() > NAN_FRACTION_THRESHOLD]
df.drop(columns=sparse_columns, inplace=True)

In [None]:
# Remove every row that still contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-count missing values per column; after the row-wise dropna every count is expected to be 0.
df.isna().sum()

### Exploratory Data Analysis

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Frequency of each distinct value in one column (e.g. to check class balance).
# Attribute access cannot take a quoted name (`df.''` is a SyntaxError), so the
# column is selected with brackets.
# NOTE(review): '' is a placeholder — fill in the column to count.
df[''].value_counts()

In [None]:
# Pairwise correlation between the numeric columns. numeric_only=True skips
# string/categorical columns, which otherwise make corr() raise in pandas >= 2.0.
df.corr(numeric_only=True)

In [None]:
# Heatmap of the numeric correlation matrix; center=0 puts the colormap midpoint
# at zero correlation. numeric_only=True keeps non-numeric columns from making
# corr() raise in pandas >= 2.0.
sns.heatmap(df.corr(numeric_only=True), center=0)

In [None]:
# Scatter one feature against the target to judge how cleanly that feature
# separates the classes.
# Open question: how would you do this for a categorical variable like RainToday?
fig, ax = plt.subplots()
ax.scatter(df['feature'], df['target'])
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
def draw_boxplots(var, y="price", data=None, ylim=(0, 3500000)):
    """Draw a box plot of ``y`` grouped by the column ``var``.

    Parameters
    ----------
    var : str
        Column placed on the x-axis (one box per distinct value).
    y : str, default "price"
        Column whose distribution is drawn on the y-axis.
    data : pandas.DataFrame, optional
        Frame to plot from. Defaults to the notebook-level ``df``.
    ylim : tuple of (float, float), default (0, 3500000)
        Lower and upper limits of the y-axis.
    """
    if data is None:
        data = df  # fall back to the notebook-level frame, as the original did
    _, ax = plt.subplots(figsize=(12, 6))
    # Draw on the axes created above explicitly instead of relying on pyplot's
    # implicit "current axes" state.
    sns.boxplot(x=var, y=y, data=data, ax=ax)
    ax.set_ylim(*ylim)

### Confusion Matrix

In [None]:
# True Positives: actual class (potato) is 1 and the assigned label is 1
is_true_positive = (machine_dataset['potato'] == 1) & (machine_dataset['label'] == 1)
machine_dataset['TP'] = is_true_positive.astype(int)

In [None]:
# False Positives: actual class (potato) is 0 but the assigned label is 1
is_false_positive = (machine_dataset['potato'] == 0) & (machine_dataset['label'] == 1)
machine_dataset['FP'] = is_false_positive.astype(int)

In [None]:
# True Negatives: actual class (potato) is 0 and the assigned label is 0
is_true_negative = (machine_dataset['potato'] == 0) & (machine_dataset['label'] == 0)
machine_dataset['TN'] = is_true_negative.astype(int)

In [None]:
# False Negatives: actual class (potato) is 1 but the assigned label is 0
is_false_negative = (machine_dataset['potato'] == 1) & (machine_dataset['label'] == 0)
machine_dataset['FN'] = is_false_negative.astype(int)

In [None]:
# Same four counts from scikit-learn: potato is passed as y_true, label as y_pred.
# For a 2x2 matrix (rows = actual, columns = predicted) ravel() yields tn, fp, fn, tp.
# NOTE(review): the 4-way unpack assumes exactly two classes are present — confirm.
tn, fp, fn, tp = confusion_matrix(machine_dataset.potato, machine_dataset.label).ravel()

In [None]:
# Column-wise totals; the sums of the TP/FP/TN/FN indicator columns are the confusion-matrix counts.
machine_dataset.sum()

In [None]:
# True Positive Rate (Sensitivity, Recall)
# TP / (all actual positives) = TP / (TP + FN)
truep = machine_dataset.TP.sum()/(machine_dataset.TP.sum()+machine_dataset.FN.sum())


# True Negative Rate (Specificity)
# TN / (all actual negatives) = TN / (TN + FP)
spec = machine_dataset.TN.sum()/(machine_dataset.TN.sum()+machine_dataset.FP.sum())

# False Positive Rate
# FP / (all actual negatives) = FP / (FP + TN), i.e. 1 - Specificity
fpr = 1-spec

print(truep, spec, fpr)

In [None]:
# ROC curve and AUC for the score column `measure` against the true class `potato`.
from sklearn.metrics import roc_curve, roc_auc_score
# roc_curve returns (fpr, tpr, thresholds); the slice keeps only the first two arrays.
# Note: this rebinds `fpr`, which the rates cell earlier set to a single scalar.
fpr, tpr = roc_curve(machine_dataset.potato, machine_dataset.measure)[:2]
auc = roc_auc_score(machine_dataset.potato, machine_dataset.measure)

In [None]:
# Display the full (fpr, tpr, thresholds) tuple returned by roc_curve.
roc_curve(machine_dataset.potato, machine_dataset.measure)

In [None]:
# Plot the ROC curve (orange) against the chance diagonal (dashed navy).
plt.figure()
lw = 2
plt.plot(fpr, tpr, linewidth=lw, color='darkorange',
         label='ROC curve (area = %0.2f)' % auc)
plt.plot((0, 1), (0, 1), linewidth=lw, linestyle='--', color='navy')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)  # small headroom so the curve is not clipped at TPR = 1
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression of `makes` on `distance` using the newton-cg solver.
# NOTE(review): `distance` and `makes` are not defined anywhere in the visible
# notebook — they must be loaded before this cell can run.
log_reg = LogisticRegression(solver='newton-cg')
log_reg.fit(distance, makes)

In [None]:
# Predicted probability of the second class (column index 1 of predict_proba), one entry per row.
class_probabilities = log_reg.predict_proba(distance)
prob_1 = list(class_probabilities[:, 1])

In [None]:
# Overlay the observed outcomes (makes) and the fitted probabilities (prob_1)
# against distance on the same axes.
fig, ax = plt.subplots()
ax.scatter(distance,makes)
ax.scatter(distance, prob_1)
plt.show()

In [None]:
# First model: uses only the two wind-speed columns (WindSpeed3pm, WindSpeed9am)
# to predict whether it will rain tomorrow.
# Can you improve it by adding or swapping features?
X = df[['WindSpeed3pm', 'WindSpeed9am', ]]
y = df['RainTomorrow']

In [None]:
# Train/test split: hold out 33% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Initialize the model; max_iter=1000 sets the solver's iteration limit.
log_reg_rain = LogisticRegression(max_iter=1000)

In [None]:
# Fit the model on the training split only.
log_reg_rain.fit(X_train,y_train)

In [None]:
# Predicted class for every row of the held-out test split.
predictions = log_reg_rain.predict(X_test)

In [None]:
# How well did the model predict the correct class?
# Accuracy = fraction of test rows whose predicted class equals the true class.
accuracy_score(y_test, predictions)

### K Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Instantiate a k-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=1)
# n_neighbors is the parameter where you specify k (here k = 1)

In [None]:
# Fit on the full X, y (no hold-out split is used in this cell).
knn.fit(X, y)

In [None]:
# Predict the class of one new observation.
# NOTE(review): this sample has 4 features, but the X assigned earlier in the
# notebook has 2 columns (WindSpeed3pm, WindSpeed9am); when run top-to-bottom
# the feature counts will not match. The 4-feature sample presumably targets
# the iris data loaded further down — confirm and reorder.
knn.predict([[3, 5, 4, 2]])

In [None]:
# tuning the k parameter
# instantiate the model (using the value K=5)
knn = KNeighborsClassifier(n_neighbors=5)

# fit the model with data
knn.fit(X, y)

# predict the response for new observations
# NOTE(review): `X_new` is not defined anywhere in the visible notebook — it must
# be created (rows with the same number of features as X) before this line can run.
knn.predict(X_new)

In [None]:
# import libraries
from sklearn.datasets import load_iris  # previously missing: load_iris() raised NameError
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# save "bunch" object containing iris dataset and its attributes
iris = load_iris()

# store feature matrix in "X"
X = iris.data

# store response vector in "y"
y = iris.target

# hold out 25% of the rows for evaluation; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# fit the scaler on the training split only, then apply the same transform to
# both splits so no test-set statistics leak into training
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 1-nearest-neighbour classifier trained on the scaled training split
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X_train, y_train)

y_predict = model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score
# F1 = 2 * (precision * recall) / (precision + recall)
# average='weighted': per-class F1 scores averaged with each class weighted by its support.

f1_score(y_test, y_predict, average='weighted')

In [None]:
# Confusion matrix followed by the per-class precision/recall/F1 report.
test_confusion = confusion_matrix(y_test, y_predict)
print(test_confusion)
print(classification_report(y_test, y_predict))

In [None]:
# search for an optimal value of K for KNN: fit one model per k in 1..10 and
# record its weighted F1 on the test split
# NOTE(review): k is selected by scoring on the test split, so the best score is
# optimistic; cross-validating on the training split would avoid that.
k_range = list(range(1, 11))
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_predict = knn.predict(X_test)
    score = f1_score(y_test, y_predict, average='weighted')
    k_scores.append( score)
print(k_scores)

In [None]:
import matplotlib.pyplot as plt

# Plot the weighted F1 score for each k tried in the search cell.
# The x-values come from k_range (not a second hard-coded range) so the plot
# stays in sync if the searched range is changed.
plt.figure(figsize=(12, 6))
plt.plot(k_range, k_scores, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('F1 score by K Value')
plt.xlabel('K Value')
plt.ylabel('F1 Score')
plt.show()

### Decision Tree Classification (AdaBoost)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# sklearn.externals.six was removed in scikit-learn 0.23; on Python 3 its
# StringIO was simply io.StringIO, so import that directly.
from io import StringIO
from sklearn.ensemble import AdaBoostClassifier
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.metrics import mean_squared_error

In [None]:
# splitting the data into training (80%) and testing (20%) sets; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

In [None]:
# Shallow decision tree (depth capped at 2) fitted on the training split.
ctree=DecisionTreeClassifier(max_depth = 2)
ctree.fit(X_train,y_train)

In [None]:
## adaboost
# Boost 200 depth-2 trees with a learning rate of 0.5.
# The boosting options belong to AdaBoostClassifier, not to the base tree: the
# original parenthesis placement passed them to DecisionTreeClassifier, which
# raises TypeError (unexpected keyword argument 'n_estimators').
# `algorithm="SAMME.R"` is omitted: it was the default in the releases that
# support it and is rejected by recent scikit-learn releases, so leaving it out
# runs everywhere.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                             n_estimators=200,
                             learning_rate=0.5)
ada_clf.fit(X_train, y_train)

In [None]:
# Render the fitted tree: export it as Graphviz DOT text into an in-memory
# buffer, parse that text with pydotplus, and display the resulting PNG inline.
dot_data = StringIO()
export_graphviz(ctree, out_file=dot_data,  
                rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

In [None]:
# Predicted class for every row of the test split, using the depth-2 tree.
clasPred = ctree.predict(X_test)

In [None]:
# Compare the tree's predictions with the true test labels (accuracy).
# `y_test` is the name produced by the train/test split above; `y_test_iris`
# was never defined and raised NameError.
accuracy_score(y_test, clasPred)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated score of the current tree on the full data set.
# X and y are the names this notebook defines; X_iris / y_iris never existed
# and raised NameError.
score = cross_val_score(ctree, X, y, cv=10)
print(score.mean())  # a bare expression in the middle of a cell is not displayed

# Sweep max_depth from 1 to 9 and record the mean cross-validated score of each.
depth_range = range(1, 10)
val = []
for depth in depth_range:
    # separate name so the fitted `ctree` from the earlier cell is not overwritten
    depth_tree = DecisionTreeClassifier(max_depth=depth)
    depth_score = cross_val_score(depth_tree, X, y, cv=10)
    val.append(depth_score.mean())
print(val)

plt.figure(figsize=(10, 10))
plt.plot(depth_range, val)
plt.xlabel('range of depth')
plt.ylabel('cross validated values')
plt.show()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 500 trees, each limited to 16 leaf nodes; n_jobs=-1 trains on
# all available cores. random_state pins the bootstrap sampling and feature
# selection so results are reproducible across runs (42 matches the seed used
# for the train/test splits in this notebook).
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1,
                                 random_state=42)

In [None]:
# Fit the forest on the training split.
rnd_clf.fit(X_train, y_train)

In [None]:
# Predicted class for every row of the test split.
y_pred_rf = rnd_clf.predict(X_test)