In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
pd.set_option('display.max_colwidth', None)

test_data = pd.read_csv('sentiment-topic-test.tsv', sep='\t')

print(test_data.head())
print(test_data.columns)


   sentence_id  \
0            0   
1            1   
2            2   
3            3   
4            4   

                                                                                sentence  \
0            The stadium was alive with the roar of the crowd after that incredible win.   
1       That last-minute goal had me jumping out of my seat—what an unbelievable finish!   
2                      I couldn’t put the book down; it swept me into a whole new world.   
3          The story had its moments, though some parts felt like they dragged on a bit.   
4  I enjoyed the way the timelines shifted, even if it got a little confusing sometimes.   

  sentiment   topic  
0  positive  sports  
1  positive  sports  
2  positive    book  
3   neutral    book  
4   neutral    book  
Index(['sentence_id', 'sentence', 'sentiment', 'topic'], dtype='object')


In [None]:
data = pd.read_csv('train.csv', sep=';')  # replace with your actual filename
print(data.head())

X = data['sentence']
y = data['topic']

X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_dev_vec = vectorizer.transform(X_dev)

svm = LinearSVC(C=1.0) 
svm.fit(X_train_vec, y_train)

y_dev_pred = svm.predict(X_dev_vec)
print("Validation performance:")
print(classification_report(y_dev, y_dev_pred))


   sentence_id  \
0            0   
1            1   
2            2   
3            3   
4            4   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]
}

grid = GridSearchCV(
    estimator=LinearSVC(max_iter=5000),  
    param_grid=param_grid,
    scoring='f1_macro', 
    cv=5,
    n_jobs=-1
)

grid.fit(X_train_vec, y_train)

print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Score:", grid.best_score_)

best_model = grid.best_estimator_
y_dev_pred = best_model.predict(X_dev_vec)

print("Validation Set Performance:")
print(classification_report(y_dev, y_dev_pred))


Best Parameters: {'C': 0.1}
Best Cross-Validation Score: 0.9839361379109844
Validation Set Performance:
              precision    recall  f1-score   support

        book       0.98      0.97      0.98      1200
       movie       0.99      0.98      0.99      1200
      sports       0.97      1.00      0.99      1200

    accuracy                           0.98      3600
   macro avg       0.98      0.98      0.98      3600
weighted avg       0.98      0.98      0.98      3600



In [None]:
X_test = test_data['sentence']
y_test = test_data['topic']

X_test_vec = vectorizer.transform(X_test)

y_test_pred = svm.predict(X_test_vec)

from sklearn.metrics import classification_report
print("Test set performance:")
print(classification_report(y_test, y_test_pred))


Test set performance:
              precision    recall  f1-score   support

        book       1.00      0.83      0.91         6
       movie       0.75      0.50      0.60         6
      sports       0.67      1.00      0.80         6

    accuracy                           0.78        18
   macro avg       0.81      0.78      0.77        18
weighted avg       0.81      0.78      0.77        18



In [None]:
misclassified_mask = y_test != y_test_pred
misclassified_sentences = X_test[misclassified_mask]
true_labels = y_test[misclassified_mask]
pred_labels = y_test_pred[misclassified_mask]

error_df = pd.DataFrame({
    'sentence': misclassified_sentences,
    'true_label': true_labels,
    'predicted_label': pred_labels
})

print(error_df)


                                                                                    sentence  \
0               The trailer was decent, giving you a taste without spilling all the secrets.   
1  I found the main character so annoying that it was hard to care about what happened next.   
2    Word is, the screenplay was scribbled down during a casual coffee break at a busy cafe.   
3      It’s surprising how a promising start could unravel into such a disappointing finish.   

  true_label predicted_label  
0      movie          sports  
1       book           movie  
2      movie          sports  
3      movie          sports  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_dev_vec = vectorizer.transform(X_dev)

svm = LinearSVC(C=1.0) 
svm.fit(X_train_vec, y_train)

y_dev_pred = svm.predict(X_dev_vec)
print("Validation performance:")
print(classification_report(y_dev, y_dev_pred))

Validation performance:
              precision    recall  f1-score   support

        book       0.98      0.97      0.98      1200
       movie       1.00      0.98      0.99      1200
      sports       0.97      1.00      0.98      1200

    accuracy                           0.98      3600
   macro avg       0.98      0.98      0.98      3600
weighted avg       0.98      0.98      0.98      3600



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Parameter grid with only C
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]
}

# Grid search setup
grid = GridSearchCV(
    estimator=LinearSVC(max_iter=5000), 
    param_grid=param_grid,
    scoring='f1_macro', 
    cv=5,
    n_jobs=-1
)

grid.fit(X_train_vec, y_train)

print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Score:", grid.best_score_)

best_model = grid.best_estimator_
y_dev_pred = best_model.predict(X_dev_vec)

print("Validation Set Performance:")
print(classification_report(y_dev, y_dev_pred))


Best Parameters: {'C': 1}
Best Cross-Validation Score: 0.9848440564863873
Validation Set Performance:
              precision    recall  f1-score   support

        book       0.98      0.97      0.98      1200
       movie       1.00      0.98      0.99      1200
      sports       0.97      1.00      0.98      1200

    accuracy                           0.98      3600
   macro avg       0.98      0.98      0.98      3600
weighted avg       0.98      0.98      0.98      3600



In [None]:
X_test = test_data['sentence']
y_test = test_data['topic']

X_test_vec = vectorizer.transform(X_test)

y_test_pred = svm.predict(X_test_vec)

from sklearn.metrics import classification_report
print("Test set performance:")
print(classification_report(y_test, y_test_pred))


Test set performance:
              precision    recall  f1-score   support

        book       1.00      0.83      0.91         6
       movie       0.75      0.50      0.60         6
      sports       0.67      1.00      0.80         6

    accuracy                           0.78        18
   macro avg       0.81      0.78      0.77        18
weighted avg       0.81      0.78      0.77        18



In [None]:
misclassified_mask = y_test != y_test_pred
misclassified_sentences = X_test[misclassified_mask]
true_labels = y_test[misclassified_mask]
pred_labels = y_test_pred[misclassified_mask]



In [None]:
error_df = pd.DataFrame({
    'sentence': misclassified_sentences,
    'true_label': true_labels,
    'predicted_label': pred_labels
})

print(error_df)


                                                                                    sentence  \
0               The trailer was decent, giving you a taste without spilling all the secrets.   
1  I found the main character so annoying that it was hard to care about what happened next.   
2    Word is, the screenplay was scribbled down during a casual coffee break at a busy cafe.   
3      It’s surprising how a promising start could unravel into such a disappointing finish.   

  true_label predicted_label  
0      movie          sports  
1       book           movie  
2      movie          sports  
3      movie          sports  


In [44]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
nb_model = MultinomialNB(alpha=1) 
nb_model.fit(X_train_vec, y_train)

y_dev_pred = nb_model.predict(X_dev_vec)
print("Validation set performance:")
print(classification_report(y_dev, y_dev_pred))

Validation set performance:
              precision    recall  f1-score   support

        book       0.92      0.97      0.94      1200
       movie       0.92      0.99      0.95      1200
      sports       1.00      0.86      0.93      1200

    accuracy                           0.94      3600
   macro avg       0.94      0.94      0.94      3600
weighted avg       0.94      0.94      0.94      3600



In [None]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 5.0]
}

grid_nb = GridSearchCV(
    MultinomialNB(),
    param_grid=param_grid,
    scoring='f1_macro', 
    cv=5,
    n_jobs=-1
)

grid_nb.fit(X_train_vec, y_train)

print("Best alpha:", grid_nb.best_params_['alpha'])
print("Best cross-val score:", grid_nb.best_score_)

y_dev_pred = grid_nb.best_estimator_.predict(X_dev_vec)
print("Dev set performance:")
print(classification_report(y_dev, y_dev_pred))

Best alpha: 1.0
Best cross-val score: 0.9393791112942959
Dev set performance:
              precision    recall  f1-score   support

        book       0.92      0.97      0.94      1200
       movie       0.92      0.99      0.95      1200
      sports       1.00      0.86      0.93      1200

    accuracy                           0.94      3600
   macro avg       0.94      0.94      0.94      3600
weighted avg       0.94      0.94      0.94      3600



In [None]:
y_test_pred = nb_model.predict(X_test_vec)

print("Test set performance:")
print(classification_report(y_test, y_test_pred))

Test set performance:
              precision    recall  f1-score   support

        book       0.71      0.83      0.77         6
       movie       0.80      0.67      0.73         6
      sports       1.00      1.00      1.00         6

    accuracy                           0.83        18
   macro avg       0.84      0.83      0.83        18
weighted avg       0.84      0.83      0.83        18



In [48]:
misclassified = X_test[y_test != y_test_pred]
true_labels = y_test[y_test != y_test_pred]
pred_labels = y_test_pred[y_test != y_test_pred]

error_df = pd.DataFrame({
    'sentence': misclassified,
    'true_label': true_labels,
    'predicted_label': pred_labels
})
print(error_df)


                                                                                         sentence  \
3                   The story had its moments, though some parts felt like they dragged on a bit.   
6   The movie was a wild ride from start to finish; I was on the edge of my seat the entire time.   
16          It’s surprising how a promising start could unravel into such a disappointing finish.   

   true_label predicted_label  
3        book           movie  
6       movie            book  
16      movie            book  
