This is an assignment on Linear Regression from the course Machine Learning for NLP at the University Paris Cite.

In [1]:
# Importing necessary libraries:
from scipy import stats as st
import matplotlib.pyplot as plt
from math import *
import numpy as np
from time import time
from sklearn import linear_model, svm
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import GridSearchCV, KFold

In [2]:
# Read the dataset from file and store in lists of documents and classes:
def read_data(file):
    # Open the file and read the lines:
    data_stream = open(file)
    # Initialize the lists and dicts:
    X = []
    y = []
    while 1:
        line = data_stream.readline()
        line = line.strip()
        if not line:
            break
        # Split the line into columns:
        cols = line.split('\t')
        # Split the classes into a list to iterate over:
        classes = cols[0].split(',')
        for c in classes:
            # Add the document and its class to the lists:
            X.append(cols[1])
            y.append(c)
    return X, y

def class_convert(y_train, y_test):
    # Because I read data separately, I need to use another method to convert the classes to integers:
    # Initialize the dicts:
    c2i = {}
    class_id = 0
    train_id = []
    test_id = []
    for c in y_train:
        if c not in c2i:
            c2i[c] = class_id
            class_id += 1
        train_id.append(c2i[c])
    for c in y_test:
        if c not in c2i:
            c2i[c] = class_id
            class_id += 1
        test_id.append(c2i[c])
    return np.array(train_id), np.array(test_id)

In [3]:
# Read the train and test data:
X_train, y_train = read_data('medium.train.onedocperline')
X_test, y_test = read_data('medium.test.onedocperline')
y_train, y_test = class_convert(y_train, y_test)

In [4]:
# Vectorizing the datasets:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# Fit_transform the train data and transform the test data:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [5]:
# Load the perceptron model:
clf = linear_model.Perceptron(shuffle = True, random_state = 42)
# Fit the model on the train data:
clf.fit(X_train, y_train)
# Score the model on the train and test data. Using accuracy_score returns the same results as clf.score:
print(f"The accuracy on the train set is: {accuracy_score(y_train, clf.predict(X_train)):.2%}")
print(f"The accuracy on the test set is: {accuracy_score(y_test, clf.predict(X_test)):.2%}")

The accuracy on the train set is: 76.41%
The accuracy on the test set is: 63.53%


In [6]:
# The accuracy is high, but it seems that the prediction is always the same class. Let's check:
print(y_test.shape)
print(y_train.shape)
print(y_test[0:48])
print(clf.predict(X_test)[0:48])

(255,)
(2535,)
[22  1 31 32 22  1 38  3 21 51 40  7 42 20  7 42 86 19 40  1  2 47 16 51
 35 36 16 17 62 22 38 30 29  1  3 13 12 15 15 35 36 22 37 68 79 16 69 15]
[22 39 31 31 93 93 93 93 93 93 93 42 42 20 19 19 19 19 19  1  1 27 36 16
 36 36 92 92 62 36 36 36 36 36 36 36 36 36 15 36 36 22 22 34 34 47 34 15]


In [7]:
# We can measure the precision of the model instead:
print(f"The precision on the train set is: {precision_score(y_train, clf.predict(X_train), average = 'macro', zero_division = 0):.2%}")
print(f"The precision on the test set is: {precision_score(y_test, clf.predict(X_test), average = 'macro', zero_division = 0):.2%}")

The precision on the train set is: 45.07%
The precision on the test set is: 26.08%


In [8]:
# Load the SVM model:
clf = svm.LinearSVC(random_state = 42)
clf.fit(X_train, y_train)
print(f"The accuracy on the train set is: {accuracy_score(y_train, clf.predict(X_train)):.2%}")
print(f"The accuracy on the test set is: {accuracy_score(y_test, clf.predict(X_test)):.2%}")

The accuracy on the train set is: 78.86%
The accuracy on the test set is: 70.98%


In [9]:
# We can measure the precision of the model instead:
print(f"The precision on the train set is: {precision_score(y_train, clf.predict(X_train), average = 'macro', zero_division = 0):.2%}")
print(f"The precision on the test set is: {precision_score(y_test, clf.predict(X_test), average = 'macro', zero_division = 0):.2%}")

The precision on the train set is: 41.43%
The precision on the test set is: 35.76%


In [12]:
# Use GridSearchCV to find the best parameters for the SVM model:
parameters = {'multi_class':('ovr', 'crammer_singer'), 'C':[1, 10]}
svc = svm.LinearSVC(random_state=42, max_iter=10000)
clf = GridSearchCV(svc, parameters)
 # type: ignoret0 = time()
clf.fit(X_train, y_train)




Done in 1681829160.658s


In [13]:
# Print the best parameters and the accuracy on the train and test sets:
print(f"The best parameters are: {clf.best_params_}")
print(f"The precision on the train set is: {precision_score(y_train, clf.predict(X_train), average = 'macro', zero_division = 0):.2%}")
print(f"The precision on the test set is: {precision_score(y_test, clf.predict(X_test), average = 'macro', zero_division = 0):.2%}")


The best parameters are: {'C': 10, 'multi_class': 'ovr'}
The precision on the train set is: 42.16%
The precision on the test set is: 35.86%


In [15]:
# Using cross validation:
parameters = {'multi_class':('ovr', 'crammer_singer'), 'C':[1, 10]}
svc = svm.LinearSVC(random_state=42)
clf = GridSearchCV(svc, parameters, scoring = 'accuracy', cv = KFold(n_splits = 5, shuffle = True, random_state = 42))
clf.fit(X_train, y_train)




Done in 101.860s




In [16]:
# Print the best parameters and the accuracy:
print(f"The best parameters are: {clf.best_params_}")
print(f"The precision on the train set is: {precision_score(y_train, clf.predict(X_train), average = 'macro', zero_division = 0):.2%}")
print(f"The precision on the test set is: {precision_score(y_test, clf.predict(X_test), average = 'macro', zero_division = 0):.2%}")

The best parameters are: {'C': 1, 'multi_class': 'crammer_singer'}
The precision on the train set is: 43.36%
The precision on the test set is: 34.87%
