In [2]:
#import library
import pandas as pd
import re
import numpy as np
import nltk
from nltk.collocations import *
#from nltk.tokenize import RegexpTokenizer
#from nltk.tokenize import MWETokenizer
from nltk.stem import PorterStemmer
from itertools import chain
from nltk.probability import *
import nltk.data

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn import svm
from sklearn.metrics import classification_report

## 1. Read data

In [4]:
all_train=pd.read_csv("Training Dataset/train_data.csv")
all_label=pd.read_csv("Training Dataset/train_label.csv")

In [5]:
test_data = pd.read_csv("test_data.csv")

## 2. Feature Extractor
* split dataset to train data and test data to train model
* extract feature 


In [6]:
#split data
x_train,x_test,y_train,y_test = train_test_split(all_train['text'].tolist(),all_label['label'].tolist(),test_size =0.2)


For feature extracting, we try unigram and bigram with TF-IDF, then try unigram and bigram with word count, see the result and comopre the results, finally we find that the feature of the model with best perform is unigram and bigram with TF-IDF.


In [5]:
#unigram and bigram with TF-IDF
# extract feature 
tfidfs = TfidfVectorizer(analyzer='word', token_pattern="\w+(?:[-']\w+)?",ngram_range=(1, 2), lowercase=True,min_df=2)
#get its TF-IDF vector
x_array = tfidfs.fit_transform(x_train)  
x_array.shape

(520000, 1984179)

In [9]:
#transfer the label to vector
y_array=np.asarray(y_train)

## 3. Model linear SVM

In [7]:
clf = svm.LinearSVC()

In [8]:
#train the model by using training dataset
clf.fit(x_array,y_array)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [9]:
#get TF-IDF vector of test data in trainingdata to get prediction and compare prediction with y_test
x_test_array = tfidfs.transform(x_test)

In [17]:
x_test_array.shape

(130000, 1984179)

In [10]:
#prediction
y_pred=clf.predict(x_test_array)
#get vector of y_test
y_test = np.asarray(y_test)

In [11]:
#see the accuracy of model
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.72      0.78      0.75     25943
           2       0.55      0.53      0.54     26012
           3       0.56      0.52      0.54     26179
           4       0.56      0.55      0.55     25934
           5       0.72      0.76      0.74     25932

    accuracy                           0.63    130000
   macro avg       0.62      0.63      0.62    130000
weighted avg       0.62      0.63      0.62    130000



## 4.Predict the test data

In [12]:
#use the model to get prediction of the given test data
#get vector
pred_data = tfidfs.transform(test_data['text'].tolist())
#predict
y_pre = clf.predict(pred_data)

Output to csv

In [13]:
#transfer format of the prediction to 
pre_test = y_pre.tolist()

In [14]:
#add the prediction to test dataframe
pre_test = pd.Series(pre_test)
test_data = pd.concat([test_data, pre_test], axis=1)
test_data = test_data.rename(columns={0:'label'})

In [15]:
#the output result to csv
test_data.to_csv("predict label.csv", sep=',' ,index=False, encoding='utf-8')

## 5. Other features

After this, we have to change the features in svm model to see if there are different results.

## Bigram_unigram, countVector
we use the split train data do the same step of bigram_unigram with TF-IDF

In [20]:
vectorizer_all = CountVectorizer(analyzer='word', token_pattern="\w+(?:[-']\w+)?",ngram_range=(1, 2), lowercase=True, min_df=2) 

In [21]:
data_count_array = vectorizer_all.fit_transform(x_train)
print (data_count_array.shape)

(520000, 1984179)


In [22]:
clf2 = svm.LinearSVC()

In [23]:
clf2.fit(data_count_array,y_array)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [24]:
xcount_test_array = vectorizer_all.transform(x_test)
xcount_test_array.shape

(130000, 1984179)

In [25]:
y_count_pred = clf2.predict(xcount_test_array)

In [26]:
#see the accuracy of model
print(classification_report(y_test,y_count_pred))

              precision    recall  f1-score   support

           1       0.70      0.72      0.71     25943
           2       0.50      0.48      0.49     26012
           3       0.49      0.48      0.48     26179
           4       0.50      0.49      0.49     25934
           5       0.68      0.71      0.69     25932

    accuracy                           0.57    130000
   macro avg       0.57      0.57      0.57    130000
weighted avg       0.57      0.57      0.57    130000



## Thigram_bigram, TF-IDFVector
using the split train data do the same step of bigram_unigram with TF-IDF

In [27]:
tfidfs_23 = TfidfVectorizer(analyzer='word', token_pattern="\w+(?:[-']\w+)?",ngram_range=(2, 3), lowercase=True,min_df=2)
#get its TF-IDF vector
x_array_23 = tfidfs_23.fit_transform(x_train)  
x_array_23.shape

(520000, 6684484)

In [28]:
clf3 = svm.LinearSVC()

In [32]:
clf3.fit(x_array_23,y_array)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [33]:
x_test_array_23 = tfidfs_23.transform(x_test)
x_test_array_23.shape

(130000, 6684484)

In [34]:
#prediction
y_pred_23=clf3.predict(x_test_array_23)


In [35]:
#see the accuracy of model
print(classification_report(y_test,y_pred_23))

              precision    recall  f1-score   support

           1       0.71      0.79      0.75     25943
           2       0.56      0.53      0.55     26012
           3       0.57      0.52      0.54     26179
           4       0.56      0.55      0.56     25934
           5       0.71      0.76      0.73     25932

    accuracy                           0.63    130000
   macro avg       0.62      0.63      0.63    130000
weighted avg       0.62      0.63      0.63    130000



## Thigram_bigram, CountVector
using the split train data do the same step of bigram_unigram with TF-IDF

In [7]:
vectorizer_all2 = CountVectorizer(analyzer='word', token_pattern="\w+(?:[-']\w+)?",ngram_range=(2, 3), lowercase=True, min_df=3) 

In [8]:
data_count_array_23 = vectorizer_all2.fit_transform(x_train)
print (data_count_array_23.shape)

(520000, 3896389)


In [3]:
clf4 = svm.LinearSVC()

In [10]:
clf4.fit(data_count_array_23,y_array)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [11]:
xcount_test_array_23 = vectorizer_all2.transform(x_test)
xcount_test_array_23.shape

(130000, 3896389)

In [13]:
y_count_pred_23 = clf4.predict(xcount_test_array_23)

In [14]:
#see the accuracy of model
print(classification_report(y_test,y_count_pred_23))

              precision    recall  f1-score   support

           1       0.71      0.72      0.72     26221
           2       0.52      0.49      0.50     26051
           3       0.50      0.48      0.49     25996
           4       0.50      0.50      0.50     25649
           5       0.68      0.73      0.70     26083

    accuracy                           0.58    130000
   macro avg       0.58      0.58      0.58    130000
weighted avg       0.58      0.58      0.58    130000

