This notebook provides code for KNN predictions only, for EDA kindly refer to the provided app.py script for dashboard

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt

## Dataset 1: Student Dropout Prediction

In [2]:
# Function 1: Data Preprocessing
def preprocess_data(path):
    """
    Function to preprocess the raw data.
    This can include tasks like handling missing values, encoding categorical variables, etc.
    """
    # Load the Data
    data = pd.read_csv(path, delimiter=';')

    # Data Preprocessing
    # Rename wrong column name
    data.rename(columns={'Nacionality': 'Nationality'}, inplace=True)
    data.rename(columns={'Daytime/evening attendance\t': 'Daytime/evening attendance'}, inplace=True)
    
    # Check for missing values
    #  print(data.isnull().sum()) # no null value
    return data

In [3]:
# Function 2: Feature Engineering
def engineer_features(data):
    """
    Function to engineer new features or transform existing ones.
    This can involve tasks like scaling, creating interaction terms, etc.
    """
    y = data['Target']  # Target variable
    X = data[['Previous qualification', "Mother's qualification", "Father's qualification"]]  # Features
    
    # Covert categorical target variable to numerical values
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    # X_reshaped = np.array(X).reshape(-1, 1)
    
    return X, y_encoded

In [4]:
# Function 3: Model Training
def train_model(X, y):
    """
    Train a KNN classifier model.
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize the KNN classifier
    model = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors (k) as needed
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    return model, X_test, y_test, y_pred

In [5]:
# Function 5: Model Evaluation (for regression)
def evaluate_model(X_test, y_test, y_pred):
    """
    Function to evaluate a regression model.
    """
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    print("Mean Absolute Error:", mae)
    print("Mean Squared Error:", mse)
    print("R-squared:", r2)
    print("Accuracy:", accuracy)
    print('Classification Report:\n', class_report)
    return accuracy

In [6]:
# Master Function to Control Workflow
def run_pipeline(path_to_data):
    """
    Master function to control the workflow/pipeline execution.
    """
    # Step 1: Data Preprocessing
    preprocessed_data = preprocess_data(path_to_data)
    
    # Step 2: Feature Engineering
    X, y = engineer_features(preprocessed_data)
    
    # Step 3: Model Training
    model, X_test, y_test, y_pred = train_model(X, y)
    
    # Step 4: Model Evaluation
    evaluation_result = evaluate_model(X_test, y_test, y_pred)
    
    return model, evaluation_result

In [7]:
path_to_data = "Data\data.csv"
model, accuracy = run_pipeline(path_to_data)

Mean Absolute Error: 0.7943502824858757
Mean Squared Error: 1.3932203389830509
R-squared: -0.707179130957186
Accuracy: 0.5050847457627119
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.41      0.45       316
           1       0.19      0.05      0.07       151
           2       0.52      0.74      0.61       418

    accuracy                           0.51       885
   macro avg       0.41      0.40      0.38       885
weighted avg       0.46      0.51      0.46       885



## Dataset 2: National Poll on Healthy Aging

In [8]:
# Load your dataset
data = pd.read_csv('Data/NPHA-doctor-visits.csv', delimiter=',')

# Selecting features (X) and target variable (y)
X = data[['Number of Doctors Visited']]
y = data['Phyiscal Health']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

# Initialize the linear regression model
model = KNeighborsClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test data
predictions = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
accuracy = 
class_report = classification_report(y_test, predictions)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('R-squared:', r2)
print('Classification Report:\n', class_report)

Mean Absolute Error: 0.6046511627906976
Mean Squared Error: 0.7441860465116279
R-squared: -0.07836990595611293
Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00        64
           3       0.47      1.00      0.63       100
           4       0.00      0.00      0.00        36
           5       0.00      0.00      0.00         2

    accuracy                           0.47       215
   macro avg       0.09      0.20      0.13       215
weighted avg       0.22      0.47      0.30       215



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Dataset 3: Paper Reviews

In [9]:
# Load your dataset
path = r'E:\IBA\Semester 1\Machine Learning 1\Assignment\Assignment1\Data Piplines\Data\reviews.json'
with open(path, 'r', encoding='utf-8') as json_file:
    # Load JSON data into a Python dictionary
    data = dict(json.load(json_file))
    
flattened_data = []
for entry in data['paper']:
    for review in entry['review']:
        entry_data = {
            'paper_id': entry['id'],
            'preliminary_decision': entry['preliminary_decision'],
            'review_id': review['id'],
            'confidence': review['confidence'],
            'evaluation': review['evaluation'],
            'lan': review['lan'],
            'orientation': review['orientation'],
            'remarks': review['remarks'],
            'text': review['text'],
            'timespan': review['timespan']
        }
        flattened_data.append(entry_data)
data = pd.DataFrame(flattened_data)

# Encode string
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['preliminary_decision'])

# Selecting features (X) and target variable (y)
X = data[['confidence']]  # Feature: Confidence score
y = y_encoded  # Target: Preliminary decision (accept/reject)

y = np.array(y).reshape(-1, 1)

imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
y_imputed = imputer.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y_imputed, test_size=0.3, random_state=123)

# Initialize the Random Forest Classifier model
model = KNeighborsClassifier()

# Train the model
model.fit(X_train, y_train.ravel())  # Note: ravel() is used to flatten y_train

# Make predictions on the test data
predictions = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
class_report = classification_report(y_test, predictions)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('R-squared:', r2)
print('Classification Report:\n', class_report)

Mean Absolute Error: 1.0327868852459017
Mean Squared Error: 3.0491803278688523
R-squared: -0.4767668879344007
Classification Report:
               precision    recall  f1-score   support

         0.0       0.65      0.92      0.76        75
         2.0       0.00      0.00      0.00         3
         3.0       0.62      0.23      0.33        44

    accuracy                           0.65       122
   macro avg       0.43      0.38      0.37       122
weighted avg       0.63      0.65      0.59       122



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Rough work

In [10]:
# data['Inflation rate']
# data.columns
# data.describe()
# plt.figure(figsize=(12, 8))
# sns.boxplot(data=data['Unemployment rate'])  # Exclude the target variable from the plot
# plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
# plt.title('Boxplot of the Data Features')
# plt.tight_layout()
# plt.show()
# print(model.predict([[2],[-0.3]]))
# print(data['preliminary_decision'].unique())