In [1]:
import pandas

In [2]:
import numpy

In [3]:
train_path = "Genre Classification Dataset/train_data.txt"
train_data = pandas.read_csv(train_path, sep=':::', names=['Title', 'Genre', 'Description'], engine='python')
train_data

test_path = "Genre Classification Dataset/test_data.txt"
test_data = pandas.read_csv(test_path, sep=':::', names=['Id', 'Title', 'Description'], engine='python')
test_data

Unnamed: 0,Id,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [4]:
print(train_data)

                                              Title          Genre  \
1                     Oscar et la dame rose (2009)          drama    
2                                     Cupid (1997)       thriller    
3                 Young, Wild and Wonderful (1980)          adult    
4                            The Secret Sin (1915)          drama    
5                           The Unrecovered (2007)          drama    
...                                             ...            ...   
54210                              "Bonino" (1953)         comedy    
54211                  Dead Girls Don't Cry (????)         horror    
54212    Ronald Goedemondt: Ze bestaan echt (2008)    documentary    
54213                     Make Your Own Bed (1944)         comedy    
54214   Nature's Fury: Storm of the Century (2006)        history    

                                             Description  
1       Listening in to a conversation between his do...  
2       A brother and sister with a past 

In [5]:
test_solution_path = "Genre Classification Dataset/test_data_solution.txt"
test_solution_data = pandas.read_csv(test_solution_path, sep=':::', names=['Id', 'Title','Genre', 'Description'], engine='python')
test_solution_data

Unnamed: 0,Id,Title,Genre,Description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a mart...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),adult,A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard..."


In [6]:
print(train_data)

                                              Title          Genre  \
1                     Oscar et la dame rose (2009)          drama    
2                                     Cupid (1997)       thriller    
3                 Young, Wild and Wonderful (1980)          adult    
4                            The Secret Sin (1915)          drama    
5                           The Unrecovered (2007)          drama    
...                                             ...            ...   
54210                              "Bonino" (1953)         comedy    
54211                  Dead Girls Don't Cry (????)         horror    
54212    Ronald Goedemondt: Ze bestaan echt (2008)    documentary    
54213                     Make Your Own Bed (1944)         comedy    
54214   Nature's Fury: Storm of the Century (2006)        history    

                                             Description  
1       Listening in to a conversation between his do...  
2       A brother and sister with a past 

In [7]:
!Pip install scikit-learn



In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Step 1: Load the data
# Assuming train_data and test_solution_data are already loaded as DataFrame
# train_data = pd.read_csv('path_to_train_data.csv')
# test_solution_data = pd.read_csv('path_to_test_solution_data.csv')

# Step 2: Extract features and labels
X_train = train_data['Description']
y_train = train_data['Genre']
X_test = test_solution_data['Description']
y_test = test_solution_data['Genre']

# Step 3: Tokenization and TF-IDF Vectorization (unigram and bigram)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 4: Label Encoding
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Step 5: Feature Selection
selector = SelectKBest(chi2, k=1000)
X_train_selected = selector.fit_transform(X_train_tfidf, y_train_encoded)
X_test_selected = selector.transform(X_test_tfidf)

# Step 6: Scaling
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Step 7: Train SGD Classifier with hyperparameter tuning
parameters = {
    'loss': ['log', 'hinge'],  # hinge is for SVM-like behavior, log for logistic regression
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'max_iter': [1000],
    'tol': [1e-3]
}

sgd_clf = SGDClassifier()
grid_search = GridSearchCV(sgd_clf, parameters, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_encoded)

best_sgd_clf = grid_search.best_estimator_

# Step 8: Apply model on scaled train and test data
y_train_pred = best_sgd_clf.predict(X_train_scaled)
y_test_pred = best_sgd_clf.predict(X_test_scaled)

# Step 9: Calculate accuracy, precision, recall, confusion matrix
accuracy_train = accuracy_score(y_train_encoded, y_train_pred)
accuracy_test = accuracy_score(y_test_encoded, y_test_pred)

precision_train = precision_score(y_train_encoded, y_train_pred, average='weighted')
precision_test = precision_score(y_test_encoded, y_test_pred, average='weighted')

recall_train = recall_score(y_train_encoded, y_train_pred, average='weighted')
recall_test = recall_score(y_test_encoded, y_test_pred, average='weighted')

conf_matrix_train = confusion_matrix(y_train_encoded, y_train_pred)
conf_matrix_test = confusion_matrix(y_test_encoded, y_test_pred)

# Print the results
print("Train Accuracy:", accuracy_train)
print("Test Accuracy:", accuracy_test)
print("Train Precision:", precision_train)
print("Test Precision:", precision_test)
print("Train Recall:", recall_train)
print("Test Recall:", recall_test)
print("Train Confusion Matrix:\n", conf_matrix_train)
print("Test Confusion Matrix:\n", conf_matrix_test)


60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
23 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HP\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\HP\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\HP\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\HP\AppData\Local\Programs\Python\Python312\Lib\site-packag

Train Accuracy: 0.4900210277788025
Test Accuracy: 0.4739667896678967
Train Precision: 0.5096801460376954
Test Precision: 0.4547310204026629
Train Recall: 0.4900210277788025
Test Recall: 0.4739667896678967
Train Confusion Matrix:
 [[  103     3     2     0     0    62     3   370   593     0     0     0
      0    73     1     0     0     0     2     0    15     6    11     0
     31     0    40]
 [    0   153    43     0     0    37     0    78   254     1     0     1
      0     7     2     0     0     0     0     0     0     4     0     3
      1     0     6]
 [    9    20   111     0     0    57     0   228   269     1     2     0
      0    41     0     0     0     0     1     0     6     6     2     2
      1     0    19]
 [    5     1     0    47     0    73     0   146   153     3     0     0
      0    25     6     0     0     0     2     0    18     8     1     0
      1     0     9]
 [    1     0     0     0    16     9     0   162    65     0     0     0
      0     2     1 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
