In [None]:
# Colab only: mount Google Drive at /content/drive so the CSV files read in
# later cells (under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

import time
import numpy as np
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') in a later cell reads it.
nltk.download('stopwords')

In [None]:
# Build the stop-word list: NLTK's English list plus the custom entries in
# /content/stopwords.txt (each non-blank line is treated as one stop word).
stop_words = set(stopwords.words('english'))
with open('/content/stopwords.txt') as f:
    for line in f:
        # strip() rather than line[:-1]: the slice cut the last character off a
        # final line without a trailing newline and kept '\r' on CRLF files.
        # Lower-cased because the documents are lower-cased below, so an
        # upper-case entry could never match a token.
        word = line.strip().lower()
        if word:  # skip blank lines so '' never becomes a stop word
            stop_words.add(word)
# Sorted list: same contents as before, but in a deterministic order.
stop_words = sorted(stop_words)

# Load the training set and normalise its text columns: drop non-ASCII
# characters, lower-case, and remove literal '<br />' tags.
df_train = pd.read_csv('/content/drive/MyDrive/bigdata2023-exercise1-classification/train.csv', encoding='utf-8')
for col in ('Title', 'Content'):
    df_train[col] = (
        df_train[col]
        .fillna('')  # a missing cell would otherwise make 'Combined' NaN, which TfidfVectorizer rejects
        .str.encode('ascii', 'ignore')
        .str.decode('ascii')
        .str.lower()
        .str.replace('<br />', '', regex=False)  # literal match; the regex default differs across pandas versions
    )

# Labels get the same normalisation, but missing labels are deliberately not
# filled: an empty string would silently become a class of its own.
# NOTE(review): predictions are later mapped back through these lower-cased
# labels, so the output categories are lower-case -- confirm that is expected.
df_train['Label'] = (
    df_train['Label']
    .str.encode('ascii', 'ignore')
    .str.decode('ascii')
    .str.lower()
    .str.replace('<br />', '', regex=False)
)

# One document per row: the title repeated three times (tripling the counts of
# its terms) followed by the content.
df_train['Combined'] = 3 * (df_train['Title'] + ' ') + df_train['Content']

In [None]:
# Model setup.
# Linear support-vector classifier with a fixed seed and a tight stopping tolerance.
clf = LinearSVC(random_state=42, tol=1e-5)

# LabelEncoder maps each distinct label to an integer in [0, n_classes - 1];
# `le` is kept so predictions can be mapped back to label strings later.
le = preprocessing.LabelEncoder()
y = le.fit_transform(df_train['Label'])

In [None]:
# TF-IDF features for the training documents; tokens listed in stop_words are
# dropped. The fitted vectorizer is reused later to transform the test set.
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(raw_documents=df_train['Combined'])

In [11]:
# Load the unlabeled test set and normalise its text columns: drop non-ASCII
# characters, lower-case, and remove literal '<br />' tags.
df_test = pd.read_csv('/content/drive/MyDrive/bigdata2023-exercise1-classification/test_without_labels.csv', encoding='utf-8')
for col in ('Title', 'Content'):
    df_test[col] = (
        df_test[col]
        .fillna('')  # a missing cell would otherwise make 'Combined' NaN, which TfidfVectorizer rejects
        .str.encode('ascii', 'ignore')
        .str.decode('ascii')
        .str.lower()
        .str.replace('<br />', '', regex=False)  # literal match; the regex default differs across pandas versions
    )

# One document per row: the title repeated three times (tripling the counts of
# its terms) followed by the content.
df_test['Combined'] = 3 * (df_test['Title'] + ' ') + df_test['Content']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Running on test set...
Training Linear SVM...
Finished training...


In [None]:
# Fit the linear SVM on the TF-IDF training matrix X and the encoded labels y
# built in the earlier cells.
clf.fit(X=X, y=y)

In [None]:
# Vectorise the test documents with the vocabulary and IDF weights fitted on
# the training set. The result gets its own name: rebinding X here would leave
# X holding test features, so re-running the training cell afterwards would
# try to fit test rows against the training labels.
X_test = vectorizer.transform(df_test['Combined'])

# Predict encoded class ids, then map them back to label strings via the
# LabelEncoder fitted on the training labels.
predictions = clf.predict(X_test)
predictions = le.inverse_transform(predictions)

In [None]:
# Two-column submission table: each test row's Id next to its predicted label,
# written to testSet_categories.csv without the index.
result = df_test[['Id']].assign(Predicted=predictions)
result.to_csv('testSet_categories.csv', index=False, sep=',')