In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.metrics import mean_squared_error



## Loading Data:

In [None]:
# Read the feature table and the label table from the working directory.
x = pd.read_csv("./pc_X_train.csv")
y = pd.read_csv("./pc_Y_train.csv")

# Store the scores with the generic `object` dtype rather than a numeric one.
y["score"] = y["score"].astype(object)

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)



## Data Exploration:



In [None]:
# Shapes, as (rows, columns), of the full tables and of each split.
print(x.shape)
print(y.shape)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

# Preview the first rows of the features. `head` has to be called: a bare
# `x.head` only displays the bound-method repr, not the data.
x.head()

In [None]:
# Pull the "score" column out of each label frame. Despite the `_pred`
# suffix, these are the labels read from the label file, not model output.
y_train_pred, y_test_pred = y_train["score"], y_test["score"]

# Summary statistics of the training labels.
y_train_pred.describe()


In [None]:
# Frequency table of the training labels.
y_train_pred.value_counts() # number of rows for each distinct score

## Data Transformation:

In [None]:
# Normalizer rescales each row (sample) to unit norm. In this cell the
# result is only inspected for its shape.
x_trans = Normalizer().fit(x_train).transform(x_train)
x_trans.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One bar chart of label frequencies per split, training split first.
for split_labels, chart_title in (
    (y_train, "Distribution of Scores in Training Set"),
    (y_test, "Distribution of Scores in Testing Set"),
):
    sns.countplot(data=split_labels, x="score")
    plt.title(chart_title)
    plt.show()


## Feature Scaling and LDA Training:


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [None]:
# Turn the labels into integer class ids (going through float first) and
# keep them as plain NumPy arrays.
y_train_pred = y_train["score"].astype(float).to_numpy().astype(int)
y_test_pred = y_test["score"].astype(float).to_numpy().astype(int)

# Fit LDA on the standardized training features; `fit` returns the estimator.
lda = LinearDiscriminantAnalysis().fit(x_train_scaled, y_train_pred)

# Classes present in the training labels.
print(np.unique(y_train_pred))



## Make predictions on the test set


In [None]:
# Predict a class label for every row of the standardized held-out split.
y_pred = lda.predict(x_test_scaled)

In [None]:
# Compare label sets: classes in the held-out truth vs. classes predicted.
print(f"Unique values in y_test_pred: {np.unique(y_test_pred)}")
print(f"Unique values in y_pred: {np.unique(y_pred)}")


In [None]:
# Count missing entries on both sides before scoring.
missing_true = pd.isna(y_test_pred).sum()
missing_pred = pd.isna(y_pred).sum()
print("Missing values in y_test_pred:", missing_true)
print("Missing values in y_pred:", missing_pred)

In [None]:
# Show the first ten (true, predicted) pairs side by side.
for actual, guessed in zip(y_test_pred[:10], y_pred[:10]):
    print("True: {}, Predicted: {}".format(actual, guessed))



## Print classification report and confusion matrix


In [None]:
# Text report of per-class metrics on the held-out split.
report_text = classification_report(y_test_pred, y_pred)
print("Classification Report:\n", report_text)


In [None]:
# Confusion matrix with true labels as the first argument and predictions
# as the second.
matrix = confusion_matrix(y_test_pred, y_pred)
print("Confusion Matrix:\n", matrix)

## Feature Scaling


In [None]:
# Rebuild the scaler and the standardized splits ahead of the regression
# section; the statistics are learned from the training split only.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Train a Linear Regression model


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Linear-regression baseline: load the data, hold out a validation split,
# tune with cross-validation, report validation RMSE, and write predictions
# for the unlabeled test file.
# NOTE: this cell rebinds x_train, y_train, x_test and y_test_pred, names that
# earlier cells also use; rerun from the top before revisiting those cells.

# Load training data
x_train = pd.read_csv("./pc_X_train.csv")
y_train = pd.read_csv("./pc_Y_train.csv")

# A regression target must be numeric, so keep the score as float instead of
# casting it to `object`.
y_train["score"] = y_train["score"].astype(float)

# Train/validation split (20% held out, fixed seed for repeatability)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Extract the 'score' column as the target variable
y_train = y_train["score"]
y_val = y_val["score"]

# Standardize the features; statistics come from the training split only
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

# Model selection and tuning (linear regression in this case).
# Candidates are ranked by RMSE, the same metric reported below, rather than
# by the estimator's default R^2 score.
param_grid = {'fit_intercept': [True, False]}
regression_model = LinearRegression()
grid_search = GridSearchCV(
    regression_model,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
)
grid_search.fit(x_train_scaled, y_train)

# Get the best model from the grid search (refit on the whole training split)
best_model = grid_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_model.predict(x_val_scaled)

# Calculate RMSE on the validation set
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Root Mean Squared Error on Validation Set:", rmse_val)

# Load the test data
x_test = pd.read_csv("./pc_X_test.csv")

# Standardize the test features using the same scaler from the training set
x_test_scaled = scaler.transform(x_test)

# Make predictions on the test set
y_test_pred = best_model.predict(x_test_scaled)

# Save predictions to a CSV file; 'Id' is the row index of the test file
predictions_df = pd.DataFrame({'Id': x_test.index, 'score': y_test_pred})
predictions_df.to_csv('pc_y_test_predicted.csv', index=False)


## Random Forest Regressor


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# Random-forest regression: load the data, hold out a validation split, tune
# with a cross-validated grid search, report validation RMSE, and write
# predictions for the unlabeled test file.
# NOTE: this cell rebinds x_train, y_train, x_test and y_test_pred, names that
# earlier cells also use; rerun from the top before revisiting those cells.

# Load training data
x_train = pd.read_csv("./pc_X_train.csv")
y_train = pd.read_csv("./pc_Y_train.csv")

# A regression target must be numeric, so keep the score as float instead of
# casting it to `object`.
y_train["score"] = y_train["score"].astype(float)

# Train/validation split (20% held out, fixed seed for repeatability)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Extract the 'score' column as the target variable
y_train = y_train["score"]
y_val = y_val["score"]

# Standardize the features. Kept so the scaled variables match the
# linear-model cell; presumably not required for tree-based models.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

# Model selection and tuning (Random Forest in this case):
# 2 * 3 * 2 * 2 = 24 candidates, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

random_forest_model = RandomForestRegressor(random_state=42)

# Rank candidates by RMSE (the metric reported below), run the fits in
# parallel, and let `verbose=1` report progress in place of ad-hoc prints.
grid_search = GridSearchCV(
    random_forest_model,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(x_train_scaled, y_train)

# Get the best model from the grid search (refit on the whole training split)
best_model = grid_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_model.predict(x_val_scaled)

# Calculate RMSE on the validation set
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Root Mean Squared Error on Validation Set:", rmse_val)

# Load the test data
x_test = pd.read_csv("./pc_X_test.csv")

# Standardize the test features using the same scaler from the training set
x_test_scaled = scaler.transform(x_test)

# Make predictions on the test set
y_test_pred = best_model.predict(x_test_scaled)

# Save predictions to a CSV file; 'Id' is the row index of the test file
predictions_df = pd.DataFrame({'Id': x_test.index, 'score': y_test_pred})
predictions_df.to_csv('pc_y_test_predicted_rf.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV

import numpy as np
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score





# Read the labelled training data and the unlabeled test features.
x_train = pd.read_csv("./pc_X_train.csv")
y_train = pd.read_csv("./pc_Y_train.csv")
df_test = pd.read_csv("./pc_X_test.csv")

# Weighted F-beta scorer (beta=3) used for cross-validation below.
f3 = make_scorer(fbeta_score, beta=3, average='weighted')

# Treat the scores as integer class labels.
y_train["score"] = y_train["score"].astype(int)

# Hold out 20% of the rows for validation, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Keep only the 'score' column as the target of each split.
y_train, y_val = y_train["score"], y_val["score"]

# Pipeline: median imputation -> PCA to 90 components -> random forest.
pipeRandFor = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='median')),
    ('pca', PCA(n_components=90, random_state=41)),
    ('RandFor', RandomForestClassifier(
        n_estimators=30,
        max_depth=9,
        class_weight='balanced_subsample',
        n_jobs=-1,
        random_state=41,
    )),
])

# 3-fold cross-validated F-beta scores on the training split.
cvsRandFor = cross_val_score(
    pipeRandFor, x_train, y_train, scoring=f3, cv=3, n_jobs=-1
)
print(cvsRandFor)

# Fit on the full training split, then score the held-out validation rows.
pipeRandFor.fit(x_train, y_train)
y_pred_RandFor = pipeRandFor.predict(x_val)

print(confusion_matrix(y_val, y_pred_RandFor))
print(fbeta_score(y_val, y_pred_RandFor, beta=3, average='weighted'))

# Predict labels for the unlabeled test features.
y_pred_RandFor_test = pipeRandFor.predict(df_test)

# Write the predictions as an "Id,score" CSV with ids 0..n-1.
out_RandFor = pd.DataFrame(y_pred_RandFor_test.astype(int))
out_RandFor.index = np.arange(len(out_RandFor))
out_RandFor.to_csv("out_RandFor.csv", header=['score'], index=True, index_label="Id")

# RMSE between the validation labels and the predicted class labels.
rmse_RandFor = np.sqrt(mean_squared_error(y_val, y_pred_RandFor))
print(f'Root Mean Square Error (RMSE): {rmse_RandFor}')
