# Xbox Prediction Project
This notebook contains the code for loading datasets, preprocessing data, training machine learning models (RandomForestClassifier and LogisticRegression), and evaluating their performance.

## Load Datasets

In [17]:
# Import Libraries
import pandas as pd

In [18]:
train_data = pd.read_csv('D:\\College_Work\\Project\\Xbox_Prediction\\train.csv')
test_data = pd.read_csv('D:\\College_Work\\Project\\Xbox_Prediction\\test.csv')

In [19]:
train_data.head()

Unnamed: 0,user,sku,category,query,click_time,query_time
0,0001cd0d10bbc585c9ba287c963e00873d4c0bfd,2032076,abcat0701002,gears of war,2011-10-09 17:22:56.101,2011-10-09 17:21:42.917
1,00033dbced6acd3626c4b56ff5c55b8d69911681,9854804,abcat0701002,Gears of war,2011-09-25 13:35:42.198,2011-09-25 13:35:33.234
2,00033dbced6acd3626c4b56ff5c55b8d69911681,2670133,abcat0701002,Gears of war,2011-09-25 13:36:08.668,2011-09-25 13:35:33.234
3,00033dbced6acd3626c4b56ff5c55b8d69911681,9984142,abcat0701002,Assassin creed,2011-09-25 13:37:23.709,2011-09-25 13:37:00.049
4,0007756f015345450f7be1df33695421466b7ce4,2541184,abcat0701002,dead island,2011-09-11 15:15:34.336,2011-09-11 15:15:26.206


## Preprocess Data

In [20]:
# Import Libraries
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
# Tokenize the queries
train_data['query'] = [re.findall(r'\w+', i.lower()) for i in train_data['query'].fillna('NONE')]

In [22]:
# Remove stopwords and digits
stopwords_eng = stopwords.words('english')
filtered_queries = []
for query in train_data['query']:
    filtered_query = [word for word in query if word not in stopwords_eng and not word.isdigit()]
    filtered_queries.append(filtered_query)
train_data['filtered_query'] = filtered_queries

In [23]:
# Lemmatize the words
lemmatizer = WordNetLemmatizer()
lemmatized_queries = []
for query in train_data['filtered_query']:
    lemmatized_query = [lemmatizer.lemmatize(word, pos="v") for word in query]
    lemmatized_queries.append(lemmatized_query)
train_data['lemmatized_query'] = [' '.join(query) for query in lemmatized_queries]

In [24]:
train_data.head()

Unnamed: 0,user,sku,category,query,click_time,query_time,filtered_query,lemmatized_query
0,0001cd0d10bbc585c9ba287c963e00873d4c0bfd,2032076,abcat0701002,"[gears, of, war]",2011-10-09 17:22:56.101,2011-10-09 17:21:42.917,"[gears, war]",gear war
1,00033dbced6acd3626c4b56ff5c55b8d69911681,9854804,abcat0701002,"[gears, of, war]",2011-09-25 13:35:42.198,2011-09-25 13:35:33.234,"[gears, war]",gear war
2,00033dbced6acd3626c4b56ff5c55b8d69911681,2670133,abcat0701002,"[gears, of, war]",2011-09-25 13:36:08.668,2011-09-25 13:35:33.234,"[gears, war]",gear war
3,00033dbced6acd3626c4b56ff5c55b8d69911681,9984142,abcat0701002,"[assassin, creed]",2011-09-25 13:37:23.709,2011-09-25 13:37:00.049,"[assassin, creed]",assassin creed
4,0007756f015345450f7be1df33695421466b7ce4,2541184,abcat0701002,"[dead, island]",2011-09-11 15:15:34.336,2011-09-11 15:15:26.206,"[dead, island]",dead island


## Split Data, Extract Features and Labels, Vectorize

In [25]:
# Import libraries
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [26]:
# Split the data
train, test = train_test_split(train_data, test_size=0.2, random_state=42)

In [27]:
# Extract features and labels
train_features = train['lemmatized_query']
test_features = test['lemmatized_query']
train_labels = train['sku']
test_labels = test['sku']

In [28]:
# Vectorize the text data
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
train_features = vectorizer.fit_transform(train_features)
test_features = vectorizer.transform(test_features)

## Train Model

### RandomForestClassifier

In [29]:
# Train the RandomForestClassifier
model = RandomForestClassifier(n_estimators=20, random_state=42)
model.fit(train_features, train_labels)
predictions = model.predict(test_features)

In [30]:
# Evaluate the model
accuracy = accuracy_score(test_labels, predictions)
scores = cross_val_score(model, train_features, train_labels, cv=5)
print(f"Accuracy: {accuracy}")
print(f"Cross-validation scores: {scores}")



Accuracy: 0.6253983240882804
Cross-validation scores: [0.63165659 0.63416433 0.6277663  0.62555326 0.63027442]


## Optimal Hyperparameters

In [31]:
# Perform Grid Search for hyperparameter tuning
parameters = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [32]:
# Perform Grid Search for hyperparameter tuning with parameters
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), parameters, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(train_features, train_labels)

Fitting 5 folds for each of 108 candidates, totalling 540 fits




0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [None, 10, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], 'n_estimators': [50, 100, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
# Get the best model from grid search
best_model = grid_search.best_estimator_

In [34]:
print(f"Best parameters: {grid_search.best_params_}")

Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


In [35]:
rf_model = RandomForestClassifier(n_estimators=200, max_depth = None, min_samples_leaf=1, min_samples_split=10, random_state=42)
rf_model.fit(train_features, train_labels)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [36]:
# Make predictions with the best model
best_predictions = rf_model.predict(test_features)
best_accuracy = accuracy_score(test_labels, best_predictions)
best_scores = cross_val_score(rf_model, train_features, train_labels, cv=5)



In [37]:
# Compute precision, recall, and F1-score
report = classification_report(test_labels,best_predictions,output_dict=True)
print("Classification Report:\n", report)

Classification Report:
 {'1004622': {'precision': 0.36363636363636365, 'recall': 0.6666666666666666, 'f1-score': 0.47058823529411764, 'support': 6.0}, '1010544': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2.0}, '1011491': {'precision': 1.0, 'recall': 0.041666666666666664, 'f1-score': 0.08, 'support': 24.0}, '1011831': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2.0}, '1012721': {'precision': 0.8695652173913043, 'recall': 0.9090909090909091, 'f1-score': 0.8888888888888888, 'support': 22.0}, '1013666': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9.0}, '1032361': {'precision': 0.6744186046511628, 'recall': 0.8055555555555556, 'f1-score': 0.7341772151898734, 'support': 36.0}, '1052221': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3.0}, '1066233': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4.0}, '1066515': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 4.0}, '1066551': {

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [38]:
# Debugging: Print the structure of the classification report
print("\nClassification Report Structure:\n", report)


Classification Report Structure:
 {'1004622': {'precision': 0.36363636363636365, 'recall': 0.6666666666666666, 'f1-score': 0.47058823529411764, 'support': 6.0}, '1010544': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2.0}, '1011491': {'precision': 1.0, 'recall': 0.041666666666666664, 'f1-score': 0.08, 'support': 24.0}, '1011831': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2.0}, '1012721': {'precision': 0.8695652173913043, 'recall': 0.9090909090909091, 'f1-score': 0.8888888888888888, 'support': 22.0}, '1013666': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9.0}, '1032361': {'precision': 0.6744186046511628, 'recall': 0.8055555555555556, 'f1-score': 0.7341772151898734, 'support': 36.0}, '1052221': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3.0}, '1066233': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4.0}, '1066515': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 4.0}, '

In [39]:
# Extract precision, recall, and F1-score
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")

Precision: 0.6211914567703127
Recall: 0.627522719225776
F1-Score: 0.5842009108781933


In [40]:
print(f"Best Model Accuracy: {best_accuracy}")
print(f"Best Model Cross-validation scores: {best_scores}")

Best Model Accuracy: 0.627522719225776
Best Model Cross-validation scores: [0.63372179 0.63622953 0.62879906 0.62717616 0.63411036]


### LogisticRegression

In [41]:
# Import libraries
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [42]:
# Extract features and labels
features = train_data['lemmatized_query']
labels = train_data['sku']

In [43]:
# Encode the target labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

In [44]:
# Split the data
train_features, test_features, train_labels_encoded, test_labels_encoded = train_test_split(
    features, labels_encoded, test_size=0.2, random_state=42)

In [45]:
# Vectorize the text data
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
train_features = vectorizer.fit_transform(train_features)
test_features = vectorizer.transform(test_features)

In [46]:
# Initialize the LogisticRegression model
lr_model = LogisticRegression(random_state=42, max_iter=1000)

In [47]:
lr_model.fit(train_features, train_labels_encoded)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [48]:
# Make predictions with the best model
lr_predictions = lr_model.predict(test_features)
lr_accuracy = accuracy_score(test_labels_encoded,lr_predictions)
lr_scores = cross_val_score(lr_model, train_features, train_labels_encoded, cv=5)



In [49]:
print(f"LogisticRegression Model Accuracy: {lr_accuracy}")
print(f"LogisticRegression Model Cross-validation scores: {lr_scores}")

LogisticRegression Model Accuracy: 0.6242181045674495
LogisticRegression Model Cross-validation scores: [0.62708364 0.6266411  0.61965181 0.61965181 0.62643848]


In [50]:
# Ensure the classification report aligns with the predicted classes
unique_labels = np.unique(test_labels_encoded)

In [51]:
# Compute precision, recall, and F1-score
report = classification_report(test_labels_encoded, lr_predictions, labels=unique_labels, target_names=label_encoder.classes_[unique_labels], output_dict=True)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [52]:
# Debugging: Print the structure of the classification report
print("\nClassification Report Structure:\n", report)


Classification Report Structure:
 {np.int64(1004622): {'precision': 0.36363636363636365, 'recall': 0.6666666666666666, 'f1-score': 0.47058823529411764, 'support': 6.0}, np.int64(1010544): {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2.0}, np.int64(1011491): {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 24.0}, np.int64(1011831): {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2.0}, np.int64(1012721): {'precision': 0.8695652173913043, 'recall': 0.9090909090909091, 'f1-score': 0.8888888888888888, 'support': 22.0}, np.int64(1013666): {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9.0}, np.int64(1032361): {'precision': 0.7073170731707317, 'recall': 0.8055555555555556, 'f1-score': 0.7532467532467533, 'support': 36.0}, np.int64(1052221): {'precision': 0.3333333333333333, 'recall': 0.3333333333333333, 'f1-score': 0.3333333333333333, 'support': 3.0}, np.int64(1066233): {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4.

In [53]:
# Extract precision, recall, and F1-score
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")

Precision: 0.6283368009079913
Recall: 0.6242181045674495
F1-Score: 0.5771620926829761
