In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

# Part 1

In [None]:
# Load the house-sales data from a CSV file located in the working directory.
# The cells below read the 'Living_area' and 'Selling_price' columns of this frame.
houses = pd.read_csv("data_assignment2.csv")
# Bare expression: shows the frame as the cell output.
houses

In [None]:
# Scatter plot: living area on the x-axis, selling price on the y-axis
fig, ax = plt.subplots()
ax.scatter(houses[['Living_area']], houses[['Selling_price']], c='b', s=50, alpha=0.4)
ax.set_title('Scatterplot')
ax.set_xlabel('Living area')
ax.set_ylabel('Selling price (million)')
plt.show()

In [None]:
# True if either regression column contains at least one missing value
houses[['Living_area', 'Selling_price']].isna().to_numpy().any()

In [None]:
# Least-squares fit of selling price against living area (single feature)
model = LinearRegression()
model = model.fit(X=houses[['Living_area']], y=houses[['Selling_price']])

In [None]:
# Plot the data together with the fitted regression line
# 1000 evenly spaced living-area values spanning the observed range of the data
xfit = np.linspace(houses['Living_area'].min(), houses['Living_area'].max(), 1000)
# The model was fitted on a DataFrame, so predict on a DataFrame with the same
# column name; scikit-learn >= 1.0 warns when predict() receives an unnamed array.
yfit = model.predict(pd.DataFrame({'Living_area': xfit}))
plt.scatter(houses[['Living_area']], houses[['Selling_price']], c = 'b', s = 50, alpha = 0.4)
plt.plot(xfit, yfit)
plt.title('Regression line')
plt.xlabel('Living area')
plt.ylabel('Selling price (million)')
plt.show()

In [None]:
# Residual plot for the first fit
pred_price = model.predict(houses[['Living_area']])
# NOTE: computed as predicted - observed, i.e. the opposite sign of the usual
# (observed - predicted) convention. The outlier filter in the next cell uses
# the absolute value, so the sign does not affect which rows are selected.
residuals = pred_price - houses[['Selling_price']]
plt.scatter(pred_price, residuals, c = 'b', s = 50, alpha = 0.4)
# Zero reference line spans the actual range of predictions instead of hard-coded limits
plt.hlines(y = 0, xmin = pred_price.min(), xmax = pred_price.max())
plt.title('Residual plot')
plt.xlabel('Predicted price')
plt.ylabel('Residuals')
plt.show()

In [None]:
# Index labels of outliers: rows whose residual magnitude exceeds 1 700 000
outliers_ind = residuals.where(residuals.abs() > 1700000).dropna().index
outliers_ind

In [None]:
# Remove the outlier rows from the data.
# outliers_ind holds index *labels*, so rows are filtered by label. Indexing
# houses.index positionally with those labels only gives the right rows while
# the index is exactly 0..n-1, and would silently drop different rows if this
# cell were executed a second time. Filtering by label is idempotent.
houses = houses[~houses.index.isin(outliers_ind)]
houses

In [None]:
# Refit the regression of selling price on living area using the current `houses` frame
model = LinearRegression()
model = model.fit(X=houses[['Living_area']], y=houses[['Selling_price']])

In [None]:
# Scatter plot of the data and the refitted regression line
# 1000 evenly spaced living-area values spanning the observed range of the data
xfit = np.linspace(houses['Living_area'].min(), houses['Living_area'].max(), 1000)
# The model was fitted on a DataFrame, so predict on a DataFrame with the same
# column name; scikit-learn >= 1.0 warns when predict() receives an unnamed array.
yfit = model.predict(pd.DataFrame({'Living_area': xfit}))
plt.scatter(houses[['Living_area']], houses[['Selling_price']], c = 'b', s = 50, alpha = 0.4)
plt.plot(xfit, yfit)
plt.title('Regression line')
plt.xlabel('Living area')
plt.ylabel('Selling price (million)')
plt.show()

In [None]:
# Fitted parameters, one per output line: slope (coef_) first, then intercept (intercept_)
print(model.coef_, model.intercept_, sep='\n')

In [None]:
# Predicted selling price for living areas of 100, 150 and 200
# (presumably square metres - same unit as the 'Living_area' column).
# Each query is passed as a DataFrame with the training column name so that
# scikit-learn >= 1.0 does not warn about missing feature names.
for area in (100, 150, 200):
    print(model.predict(pd.DataFrame({'Living_area': [area]})))

In [None]:
# Residual plot for the refitted model
pred_price = model.predict(houses[['Living_area']])
# NOTE: computed as predicted - observed, i.e. the opposite sign of the usual
# (observed - predicted) convention.
residuals = pred_price - houses[['Selling_price']]
plt.scatter(pred_price, residuals, c = 'b', s = 50, alpha = 0.4)
# Zero reference line spans the actual range of predictions instead of hard-coded limits
plt.hlines(y = 0, xmin = pred_price.min(), xmax = pred_price.max())
plt.title('Residual plot')
plt.xlabel('Predicted price')
plt.ylabel('Residuals')
plt.show()


# Part 2

In [None]:
# Load the iris data set through scikit-learn. No DataFrame is built here:
# later cells read the `.data` and `.target` attributes of the returned object directly.
from sklearn.datasets import load_iris
iris_raw = load_iris()
# Bare expression: shows the loaded object as the cell output.
iris_raw

In [None]:
# Split into training and test sets (25 % held out, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(iris_raw.data, 
                                                    iris_raw.target, 
                                                    test_size=0.25, random_state=0)

# One-vs-rest logistic regression using the liblinear solver
logReg = LogisticRegression(multi_class='ovr', solver='liblinear')

# Fit logistic regression model to training data
logReg_model = logReg.fit(x_train, y_train)

# Class names used as display labels and as table labels further down
targets=['Setosa','Versicolor', 'Virginica']

# Plot confusion matrix on an explicitly created axes. Calling plt.figure()
# first would leave an extra, empty figure because plot_confusion_matrix
# creates its own figure when no `ax` is supplied.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator instead.
fig, ax = plt.subplots()
plot_confusion_matrix(logReg_model,x_test,y_test, ax=ax, cmap='Blues', display_labels=targets)
ax.set_title('Confusion matrix logistic regression')
plt.show()

In [None]:
# KNN classifiers with uniform weights: one fitted model per neighbourhood size
knn_n_values = [1, 6, 11, 23]
knn_uniform = [
    KNeighborsClassifier(n_neighbors=k, weights='uniform').fit(x_train, y_train)
    for k in knn_n_values
]

In [None]:
# Confusion matrices for the uniform-weight KNN classifiers (2 x 2 grid, one panel per k)
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator instead.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15,10))

for c_uniform, ax in zip(knn_uniform, axes.flatten()):
    # Reuse the shared `targets` list instead of repeating the label literals
    plot_confusion_matrix(c_uniform, x_test, y_test, ax=ax, cmap='Blues', display_labels=targets)
    ax.set_title('Neighbors = '+str(c_uniform.n_neighbors))

# Add the figure title first and reserve the top strip for it, so that it
# does not overlap the titles of the upper row of panels.
fig.suptitle('KNN classifiers with uniform weights')
fig.tight_layout(rect=(0, 0, 1, 0.96))
plt.show()

In [None]:
# KNN classifiers with distance weights: one fitted model per neighbourhood size
knn_distance = [
    KNeighborsClassifier(n_neighbors=k, weights='distance').fit(x_train, y_train)
    for k in knn_n_values
]

In [None]:
# Confusion matrices for the distance-weight KNN classifiers (2 x 2 grid, one panel per k)
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator instead.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15,10))

for c_distance, ax in zip(knn_distance, axes.flatten()):
    # Reuse the shared `targets` list instead of repeating the label literals
    plot_confusion_matrix(c_distance, x_test, y_test, ax=ax, cmap='Blues', display_labels=targets)
    ax.set_title('Neighbors = '+str(c_distance.n_neighbors))

# Add the figure title first and reserve the top strip for it, so that it
# does not overlap the titles of the upper row of panels.
fig.suptitle('KNN classifiers with distance weights')
fig.tight_layout(rect=(0, 0, 1, 0.96))
plt.show()

In [None]:
from sklearn import metrics

In [None]:
# Test-set predictions for every fitted classifier.
test_log_reg_pred = logReg_model.predict(x_test)
# Iterate over the classifier lists themselves rather than range(4), so the
# result stays aligned with knn_n_values if the set of k values changes.
test_knn_uni_pred = [clf.predict(x_test) for clf in knn_uniform]
test_knn_dist_pred = [clf.predict(x_test) for clf in knn_distance]

In [None]:
def get_metrics(prediction, average=None):
    """Score `prediction` against the global test labels `y_test`.

    Parameters
    ----------
    prediction : array-like
        Predicted class labels for the test set.
    average : str or None, optional
        Forwarded unchanged as the `average` argument of the scikit-learn
        precision, F1 and recall scorers.

    Returns
    -------
    tuple
        ``((accuracy,), precision, f_score, recall)``. Accuracy is wrapped in
        a one-element tuple; the other three entries are whatever the
        corresponding scikit-learn scorer returns for the given `average`.
    """
    scorers = (metrics.precision_score, metrics.f1_score, metrics.recall_score)
    precision, f_score, recall = (
        scorer(y_test, prediction, average=average) for scorer in scorers
    )
    accuracy = metrics.accuracy_score(y_test, prediction)

    return (accuracy,), precision, f_score, recall

In [None]:
# Test-set metrics of the logistic-regression model, as returned by get_metrics
# with its default average=None: ((accuracy,), precision, f_score, recall).
metrics_log_reg = get_metrics(test_log_reg_pred)
metrics_log_reg

In [None]:
# Metrics for each uniform-weight KNN model. Iterating over the predictions
# directly (instead of range(4)) keeps this in step with the number of k values.
metrics_knn_uni = [get_metrics(pred) for pred in test_knn_uni_pred]
metrics_knn_uni

In [None]:
# Metrics for each distance-weight KNN model. Iterating over the predictions
# directly (instead of range(4)) keeps this in step with the number of k values.
metrics_knn_dist = [get_metrics(pred) for pred in test_knn_dist_pred]
metrics_knn_dist

In [None]:
def metrics_to_dataframe_single(metrics, labels=None):
    """Turn one `get_metrics` result into a table with one row per class.

    Parameters
    ----------
    metrics : sequence
        ``((accuracy,), precision, f_score, recall)`` where the last three
        entries hold one value per class. (The parameter name shadows the
        `sklearn.metrics` module inside this function only.)
    labels : list of str, optional
        Class names, one per class. Defaults to the module-level `targets`.

    Returns
    -------
    pandas.DataFrame
        Rows are the class labels; columns are "Accuracy", "Precision",
        "F-score" and "Recall". The single accuracy value is repeated on
        every row.
    """
    if labels is None:
        labels = targets
    # The one-element accuracy row is shorter than the per-class rows, so the
    # constructor pads it with missing values.
    df = pd.DataFrame(data=metrics, index=["Accuracy", "Precision", "F-score", "Recall"], columns=labels)
    # After transposing, forward-fill copies the accuracy down to every class row.
    # .ffill() replaces the deprecated fillna(method="ffill").
    df = df.T.ffill()
    return df

In [None]:
def metrics_to_dataframe_multi(metrics):
    """Stack the per-model metric tables of several KNN classifiers.

    `metrics` is a sequence of `get_metrics` results, paired in order with the
    module-level `knn_n_values`. Each result is converted with
    `metrics_to_dataframe_single`, and the tables are concatenated row-wise
    with the matching k value as the outer index level.
    """
    per_model_frames = list(map(metrics_to_dataframe_single, metrics))
    frames_by_k = dict(zip(knn_n_values, per_model_frames))
    return pd.concat(frames_by_k, axis=0)

In [None]:
# Logistic-regression test metrics as a table (one row per class label)
metrics_to_dataframe_single(metrics_log_reg)

In [None]:
# Uniform-weight KNN test metrics, stacked with the k value as the outer index level
metrics_to_dataframe_multi(metrics_knn_uni)

In [None]:
# Distance-weight KNN test metrics, stacked with the k value as the outer index level
metrics_to_dataframe_multi(metrics_knn_dist)