In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import metrics, svm
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from xgboost import XGBClassifier


<h1>Configuration</h1>

In [None]:
# Seed shared by every step that accepts one (e.g. the train/test split).
random_state: int = 42

# Dataset location and schema -- fill these in before running the notebook.
path_to_dataset: str = ""
index_col: str = ""
target_variable: str = ""

# Fraction of the rows held out as the test set.
test_size: float = 0.3

# Exclusive upper bound for the k-values tried in the KNN search
# (k runs over range(1, max_iterations)).
# Raise this to 50 or even 150 if you have lots of data rows.
max_iterations: int = 20

<h1>Dataset loading</h1>

In [None]:
# Loads the dataset from file.
# Provide index_col in the configuration to avoid spawning the index column
# "Unnamed: 0". An empty string means "no index column"; pandas expects None
# for that, so translate it here (a plain `or None` would also swallow a
# legitimate positional index of 0).
df = pd.read_csv(path_to_dataset, index_col=index_col if index_col != "" else None)

# "Country" is dropped before modelling; errors="ignore" keeps the notebook
# usable with datasets that do not contain that column.
df = df.drop(columns=["Country"], errors="ignore")

# Single-row frame collecting one accuracy column per model
# (each model section adds its own column).
metrics_df = pd.DataFrame({"Metric": ["Average accuracy"]})

df.head()

<h1>Split the dataset</h1>

In [None]:
# Features: every column except the target variable.
X = df.drop(columns=[target_variable])

# Target: class labels encoded as integer codes by LabelEncoder.
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(df[target_variable])

# Hold out `test_size` of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)


In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test-set statistics leak into
# training. (StandardScaler is already imported in the imports cell.)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

<h1>Models</h1>

<table>
    <tr>
        <th>Algorithm</th>
        <th>Pros</th>
        <th>Cons</th>
        <th>Data requirements</th>
    </tr>
    <tr>
        <th>Linear regression</th>
        <th>Simple and easy to interpret</th>
        <th>Sensitive to outliers</th>
        <th>• Requires linear relationship between variables
            • Independent data points
            • Normal distribution
            • Constant variance between data and
            predictions
        </th>
    </tr>
    <tr>
        <th>Support Vector Machines (SVM)</th>
        <th>Effective in high-dimensional spaces (high number of columns)</th>
        <th>• Can be slow and memory-intensive for large datasets (amount of rows)
        • Computationally intensive
        • Optimization can be tedious</th>
        <th>Data scaling required</th>
    </tr>
    <tr>
        <th>Random forests</th>
        <th>Handles non-linear relationships well</th>
        <th>• Can overfit with noisy and overlapping data
            • Can be slow for real-time prediction</th>
        <th>Less preprocessing, handles missing values well</th>
    </tr>
    <tr>
        <th>KNN</th>
        <th>Simple and easy to understand</th>
        <th>• Computationally expensive for large datasets
            • Sensitive to irrelevant variables
            • Prediction can be slow for large datasets</th>
        <th>Data balance is important (equal representation for each output class)</th>
    </tr>
</table>

<h3>Logistic Regression</h3>

In [None]:
# Logistic-regression baseline with library-default settings, trained on the
# training split.
logmodel = LogisticRegression().fit(X_train, y_train)

# Predicted class codes for the held-out test split.
predictions = logmodel.predict(X_test)


In [None]:
# Class names in the order LabelEncoder assigned the integer codes (sorted),
# so every report row is labelled with the class it actually describes.
# (df[...].unique() returns order of appearance, which need not match.)
class_names = label_encoder.classes_.astype(str)

# print the classification report based on true values and predictions;
# passing `labels` keeps the report well-formed even if a class is absent
# from the test split.
print(classification_report(
    y_test,
    predictions,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

# get overall accuracy of the model, store it for the final comparison and print it
acc = accuracy_score(y_test, predictions)
metrics_df["LogReg"] = acc
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
# Tick labels come from the encoder so integer codes map back to class names.
class_names = label_encoder.classes_.astype(str)
cm = confusion_matrix(y_test, predictions, labels=np.arange(len(class_names)))

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="g", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel="Predicted class", ylabel="True class", title="Logistic regression - confusion matrix")
plt.show()

<h3>SVM</h3>

In [None]:
# Support-vector classifier inside a scaling pipeline.
# probability=True makes SVC run an internal, shuffled cross-validation for
# its probability estimates; passing the configured seed keeps that step
# reproducible between runs.
model = make_pipeline(
    StandardScaler(),
    svm.SVC(probability=True, random_state=random_state),
)
model.fit(X_train, y_train)

# Predicted class codes for the held-out test split.
predictions = model.predict(X_test)


In [None]:
# Class names in the order LabelEncoder assigned the integer codes (sorted),
# so every report row is labelled with the class it actually describes.
class_names = label_encoder.classes_.astype(str)

# print the classification report based on true values and predictions;
# passing `labels` keeps the report well-formed even if a class is absent
# from the test split.
print(classification_report(
    y_test,
    predictions,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

# get overall accuracy of the model, store it for the final comparison and print it
acc = accuracy_score(y_test, predictions)
metrics_df["SVC"] = acc
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))
print("\n")

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
# Tick labels come from the encoder so integer codes map back to class names.
class_names = label_encoder.classes_.astype(str)
cm = confusion_matrix(y_test, predictions, labels=np.arange(len(class_names)))

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="g", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel="Predicted class", ylabel="True class", title="SVM - confusion matrix")
plt.show()

<h3>Random forests</h3>


In [None]:
# Random forest inside a scaling pipeline.
# The forest draws random bootstrap samples and feature subsets, so the
# configured seed is passed to make the reported metrics repeatable.
model = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=random_state),
)
model.fit(X_train, y_train)

# Predicted class codes for the held-out test split.
predictions = model.predict(X_test)

In [None]:
# Class names in the order LabelEncoder assigned the integer codes (sorted),
# so every report row is labelled with the class it actually describes.
class_names = label_encoder.classes_.astype(str)

# print the classification report based on true values and predictions;
# passing `labels` keeps the report well-formed even if a class is absent
# from the test split.
print(classification_report(
    y_test,
    predictions,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

# get overall accuracy of the model, store it for the final comparison and print it
acc = accuracy_score(y_test, predictions)
metrics_df["Random Forest"] = acc
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))
print("\n")


In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
# Tick labels come from the encoder so integer codes map back to class names.
class_names = label_encoder.classes_.astype(str)
cm = confusion_matrix(y_test, predictions, labels=np.arange(len(class_names)))

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="g", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel="Predicted class", ylabel="True class", title="Random forest - confusion matrix")
plt.show()

<h3>XGB</h3>

In [None]:
# Gradient-boosted trees (XGBoost) with library-default hyperparameters,
# trained on the training split.
model = XGBClassifier().fit(X_train, y_train)

# Predicted class codes for the held-out test split.
predictions = model.predict(X_test)



In [None]:
# Class names in the order LabelEncoder assigned the integer codes (sorted),
# so every report row is labelled with the class it actually describes.
class_names = label_encoder.classes_.astype(str)

# print the classification report based on true values and predictions;
# passing `labels` keeps the report well-formed even if a class is absent
# from the test split.
print(classification_report(
    y_test,
    predictions,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

# get overall accuracy of the model, store it for the final comparison and print it
acc = accuracy_score(y_test, predictions)
metrics_df["XGB"] = acc
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))
print("\n")


In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
# Tick labels come from the encoder so integer codes map back to class names.
class_names = label_encoder.classes_.astype(str)
cm = confusion_matrix(y_test, predictions, labels=np.arange(len(class_names)))

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="g", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel="Predicted class", ylabel="True class", title="XGBoost - confusion matrix")
plt.show()

<h3>KNN</h3>

In [None]:
# the default metric in scikit-learn
metric_selection = "minkowski"

# Candidate k-values: 1 .. max_iterations - 1.
# KNN cannot use more neighbours than there are training rows, so the range
# is capped at len(X_train) to avoid a ValueError at prediction time.
k_values = list(range(1, min(max_iterations, len(X_train) + 1)))
if not k_values:
    raise ValueError("No k-values to evaluate: max_iterations must be at least 2")

# NOTE(review): k is chosen by its error on the test set, so the KNN accuracy
# reported afterwards is optimistically biased. Consider selecting k with
# cross-validation on the training data instead.

# list to save the error value of each candidate k
error = []

for k in k_values:
    # try with current k-value, train the model and make a test prediction
    knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k, metric=metric_selection))
    knn.fit(X_train, y_train)
    pred_k = knn.predict(X_test)

    # save the error value (share of misclassified test rows) for this k-value
    error.append(np.mean(pred_k != y_test))

# plot all the k-values and their error values
sns.lineplot(x=k_values, y=error, marker='o')
plt.title('Error Rate K Value (choose k-value of lowest Mean Error)')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

# choose best k-value based on metrics: argmin() returns a position in
# `error`, which is mapped back to the k-value that produced it
# (the smallest k wins on ties)
print("Based on iterations, best k-value is this:")
k_value = k_values[int(np.argmin(error))]
print(k_value)

# build the model with the optimal values
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k_value, metric=metric_selection))
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
# Class names in the order LabelEncoder assigned the integer codes (sorted),
# so every report row is labelled with the class it actually describes.
class_names = label_encoder.classes_.astype(str)

# print the classification report based on true values and predictions;
# passing `labels` keeps the report well-formed even if a class is absent
# from the test split.
print(classification_report(
    y_test,
    predictions,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

# get overall accuracy of the model, store it for the final comparison and print it
acc = accuracy_score(y_test, predictions)
metrics_df["KNN"] = acc
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))
print("\n")

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
# Tick labels come from the encoder so integer codes map back to class names.
class_names = label_encoder.classes_.astype(str)
cm = confusion_matrix(y_test, predictions, labels=np.arange(len(class_names)))

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="g", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel="Predicted class", ylabel="True class", title="KNN - confusion matrix")
plt.show()

In [None]:
# Side-by-side comparison: one accuracy column per model trained above.
metrics_df