# Xbox Prediction Project
This notebook contains the code for loading datasets, preprocessing data, training machine learning models (RandomForestClassifier and LogisticRegression), and evaluating their performance.

## Load Datasets

In [17]:
# Import Libraries
import os
from pathlib import Path

import pandas as pd

In [18]:
# Directory that holds train.csv and test.csv. The original machine-specific
# location stays the default; set the XBOX_DATA_DIR environment variable to
# point the notebook at a different folder without editing this cell.
DATA_DIR = Path(os.environ.get('XBOX_DATA_DIR', 'D:\\College_Work\\Project\\Xbox_Prediction'))
train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')

In [19]:
# Preview the first five rows of the raw training frame.
train_data.head(n=5)

Unnamed: 0,user,sku,category,query,click_time,query_time
0,0001cd0d10bbc585c9ba287c963e00873d4c0bfd,2032076,abcat0701002,gears of war,2011-10-09 17:22:56.101,2011-10-09 17:21:42.917
1,00033dbced6acd3626c4b56ff5c55b8d69911681,9854804,abcat0701002,Gears of war,2011-09-25 13:35:42.198,2011-09-25 13:35:33.234
2,00033dbced6acd3626c4b56ff5c55b8d69911681,2670133,abcat0701002,Gears of war,2011-09-25 13:36:08.668,2011-09-25 13:35:33.234
3,00033dbced6acd3626c4b56ff5c55b8d69911681,9984142,abcat0701002,Assassin creed,2011-09-25 13:37:23.709,2011-09-25 13:37:00.049
4,0007756f015345450f7be1df33695421466b7ce4,2541184,abcat0701002,dead island,2011-09-11 15:15:34.336,2011-09-11 15:15:26.206


## Preprocess Data

In [20]:
# Import Libraries
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources requested by name here: the 'stopwords' corpus and
# the 'wordnet' corpus. The second call is the cell's last expression, so its
# return value is what the notebook displays.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
# Tokenize the queries
def _tokenize_query(raw_query):
    """Return the lowercase word tokens of a single query.

    Parameters:
        raw_query: the query value taken from the 'query' column. A list is
            treated as an already-tokenized query and returned unchanged, so
            re-running this cell does not fail on its own earlier output.
            Any other value is converted with str() before matching.

    Returns:
        list of str: every r'\w+' match in the lowercased query text.
    """
    if isinstance(raw_query, list):
        return raw_query
    return re.findall(r'\w+', str(raw_query).lower())


# Missing queries are replaced by the placeholder 'NONE' before tokenizing.
train_data['query'] = [_tokenize_query(raw) for raw in train_data['query'].fillna('NONE')]

In [22]:
# Remove stopwords and digits
stopwords_eng = stopwords.words('english')
# Membership is tested once per token, so look words up in a frozenset instead
# of scanning the stopword list each time.
stopword_lookup = frozenset(stopwords_eng)
filtered_queries = [
    [word for word in query if word not in stopword_lookup and not word.isdigit()]
    for query in train_data['query']
]
train_data['filtered_query'] = filtered_queries

In [23]:
# Lemmatize every remaining token with pos="v", then join each query's tokens
# back into one space-separated string.
lemmatizer = WordNetLemmatizer()
lemmatized_queries = [
    [lemmatizer.lemmatize(token, pos="v") for token in tokens]
    for tokens in train_data['filtered_query']
]
train_data['lemmatized_query'] = list(map(' '.join, lemmatized_queries))

In [24]:
# Preview the first five rows, now including the filtered and lemmatized columns.
train_data.head(n=5)

Unnamed: 0,user,sku,category,query,click_time,query_time,filtered_query,lemmatized_query
0,0001cd0d10bbc585c9ba287c963e00873d4c0bfd,2032076,abcat0701002,"[gears, of, war]",2011-10-09 17:22:56.101,2011-10-09 17:21:42.917,"[gears, war]",gear war
1,00033dbced6acd3626c4b56ff5c55b8d69911681,9854804,abcat0701002,"[gears, of, war]",2011-09-25 13:35:42.198,2011-09-25 13:35:33.234,"[gears, war]",gear war
2,00033dbced6acd3626c4b56ff5c55b8d69911681,2670133,abcat0701002,"[gears, of, war]",2011-09-25 13:36:08.668,2011-09-25 13:35:33.234,"[gears, war]",gear war
3,00033dbced6acd3626c4b56ff5c55b8d69911681,9984142,abcat0701002,"[assassin, creed]",2011-09-25 13:37:23.709,2011-09-25 13:37:00.049,"[assassin, creed]",assassin creed
4,0007756f015345450f7be1df33695421466b7ce4,2541184,abcat0701002,"[dead, island]",2011-09-11 15:15:34.336,2011-09-11 15:15:26.206,"[dead, island]",dead island


## Split Data, Extract Features and Labels, Vectorize

In [25]:
# Import libraries
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [26]:
# Hold out 20% of the rows as a test split; the seed is fixed at 42.
train, test = train_test_split(
    train_data,
    random_state=42,
    test_size=0.2,
)

In [27]:
# Inputs are the lemmatized query strings; targets are the 'sku' column.
train_features, test_features = (split['lemmatized_query'] for split in (train, test))
train_labels, test_labels = (split['sku'] for split in (train, test))

In [28]:
# Vectorize the text data
# The text is read straight from the `train` / `test` frames rather than from
# `train_features` / `test_features`: this cell rebinds those two names to the
# TF-IDF matrices, so reading them back on a second run would hand the
# vectorizer its own output instead of the query strings.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
train_features = vectorizer.fit_transform(train['lemmatized_query'])
test_features = vectorizer.transform(test['lemmatized_query'])

## Train Model

### RandomForestClassifier

In [29]:
# Fit a 20-tree random forest (seed 42) on the training matrix and predict
# labels for the held-out test matrix.
model = RandomForestClassifier(random_state=42, n_estimators=20)
model.fit(X=train_features, y=train_labels)
predictions = model.predict(X=test_features)

In [30]:
# Hold-out accuracy of the fitted forest, plus 5-fold cross-validation scores
# computed on the training split.
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
scores = cross_val_score(estimator=model, X=train_features, y=train_labels, cv=5)
print(
    f"Accuracy: {accuracy}",
    f"Cross-validation scores: {scores}",
    sep="\n",
)



Accuracy: 0.6253983240882804
Cross-validation scores: [0.63165659 0.63416433 0.6277663  0.62555326 0.63027442]


## Optimal Hyperparameters

In [31]:
# Candidate hyperparameter values for the grid search: 3 * 4 * 3 * 3 = 108
# combinations in total.
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Exhaustive 5-fold search over `parameters`, using all available cores
# (n_jobs=-1) and verbose progress logging.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(train_features, train_labels)

Fitting 5 folds for each of 108 candidates, totalling 540 fits




In [None]:
# Get the best model from grid search
# NOTE(review): `best_model` is not read by any later cell shown here; the
# following cells build `rf_model` from literal hyperparameters instead.
best_model = grid_search.best_estimator_

In [None]:
# Show the hyperparameter combination selected by the grid search.
print(f"Best parameters: {grid_search.best_params_}")

In [None]:
# Fit a forest with fixed hyperparameters.
# NOTE(review): these values are literals, not read from
# grid_search.best_params_ -- confirm they match the search result.
rf_params = dict(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=10,
)
rf_model = RandomForestClassifier(random_state=42, **rf_params)
rf_model.fit(train_features, train_labels)

In [None]:
# Predict on the test matrix with the tuned forest, then record its hold-out
# accuracy and 5-fold cross-validation scores on the training split.
best_predictions = rf_model.predict(X=test_features)
best_accuracy = accuracy_score(y_true=test_labels, y_pred=best_predictions)
best_scores = cross_val_score(estimator=rf_model, X=train_features, y=train_labels, cv=5)

In [None]:
# Compute precision, recall, and F1-score
# zero_division=0 yields the same 0.0 scores as the default 'warn' setting for
# classes with no predicted or no true samples, but without emitting an
# UndefinedMetricWarning for each of them.
report = classification_report(
    test_labels,
    best_predictions,
    output_dict=True,
    zero_division=0,
)
print("Classification Report:\n", report)

In [None]:
# Debugging: Print the structure of the classification report
# Printing the whole dict here would repeat the dump from the previous cell,
# so show only the aggregate entries (each is included only if present).
summary_keys = ('accuracy', 'macro avg', 'weighted avg')
print("\nClassification Report Structure:\n", {key: report[key] for key in summary_keys if key in report})

In [None]:
# Pull the support-weighted averages out of the report dict and print them.
weighted_avg = report['weighted avg']
precision, recall, f1_score = (
    weighted_avg[metric] for metric in ('precision', 'recall', 'f1-score')
)

print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1_score}",
    sep="\n",
)

In [None]:
# Report the tuned forest's hold-out accuracy and cross-validation scores.
print(
    f"Best Model Accuracy: {best_accuracy}",
    f"Best Model Cross-validation scores: {best_scores}",
    sep="\n",
)

### LogisticRegression

In [None]:
# Import libraries
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [None]:
# Inputs are the lemmatized query strings of the full frame; targets are 'sku'.
features, labels = train_data['lemmatized_query'], train_data['sku']

In [None]:
# Fit the encoder on all labels, then map each label to its integer code.
label_encoder = LabelEncoder().fit(labels)
labels_encoded = label_encoder.transform(labels)

In [None]:
# 80/20 split of the query text and encoded labels, seeded with 42.
split_parts = train_test_split(features, labels_encoded, random_state=42, test_size=0.2)
train_features, test_features, train_labels_encoded, test_labels_encoded = split_parts

In [None]:
# Fit a fresh TF-IDF vectorizer on the training text and apply it to the test text.
# NOTE(review): `train_features` / `test_features` are rebound from text to
# TF-IDF matrices here, so this cell expects them to still hold the raw text
# produced by the split when it runs.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, sublinear_tf=True)
train_features = vectorizer.fit_transform(train_features)
test_features = vectorizer.transform(test_features)

In [None]:
# Logistic regression with a fixed seed and an iteration cap of 1000.
lr_model = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
# Fit the logistic regression on the TF-IDF training matrix and encoded labels.
lr_model.fit(train_features, train_labels_encoded)

In [None]:
# Predict with the logistic regression model, then record its hold-out
# accuracy and 5-fold cross-validation scores on the training split.
lr_predictions = lr_model.predict(X=test_features)
lr_accuracy = accuracy_score(y_true=test_labels_encoded, y_pred=lr_predictions)
lr_scores = cross_val_score(estimator=lr_model, X=train_features, y=train_labels_encoded, cv=5)

In [None]:
# Report the logistic regression's hold-out accuracy and cross-validation scores.
print(
    f"LogisticRegression Model Accuracy: {lr_accuracy}",
    f"LogisticRegression Model Cross-validation scores: {lr_scores}",
    sep="\n",
)

In [None]:
# Collect the sorted distinct encoded labels that occur in the *true* test
# labels (not the predictions); the next cell passes them to
# classification_report as `labels` and uses them to index the class names.
unique_labels = np.unique(test_labels_encoded)

In [None]:
# Compute precision, recall, and F1-score
# `target_names` is documented as a list of strings, so the original class
# values stored on the encoder are cast to str before being used as report keys.
# zero_division=0 yields the same 0.0 scores as the default 'warn' setting
# but without an UndefinedMetricWarning for every class that has no predictions.
report = classification_report(
    test_labels_encoded,
    lr_predictions,
    labels=unique_labels,
    target_names=label_encoder.classes_[unique_labels].astype(str),
    output_dict=True,
    zero_division=0,
)

In [None]:
# Debugging: Print the structure of the classification report
# This prints the entire report dict: one entry per label passed to
# classification_report, followed by the aggregate rows.
print("\nClassification Report Structure:\n", report)

In [None]:
# Pull the support-weighted averages out of the report dict and print them.
weighted_avg = report['weighted avg']
precision, recall, f1_score = (
    weighted_avg[metric] for metric in ('precision', 'recall', 'f1-score')
)

print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1_score}",
    sep="\n",
)