# Part 2: Feature Construction, Classification and Evaluation

## Import of packages

In [2]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import scipy.stats as st
from sklearn.dummy import DummyClassifier
from scipy.stats import ttest_rel

## Define function and data loading

In [3]:
def read_data(path):
    # Create an empty DataFrame to store the data, with two fields: the text and the category
    data = pd.DataFrame(index=[], columns=['text','category'])
    # Lists the content of a directory; we use two loops because or files are stored into subdirectories
    i = 0
    for cat in os.listdir(path):
        for f in os.listdir(os.path.join(path, cat)):    
            content = open(os.path.join(path, cat, f)).read()
            data.loc[i] = [content, cat]
            i = i + 1
    return data

os.chdir('C:\\Users\\Marie.Zamecnikova\\OneDrive - Ecorys\\Desktop\\Data\\Data')

## Use of training data

In [4]:
train_data = read_data('train')
print(train_data.shape)

train_data

(2464, 2)


Unnamed: 0,text,category
0,<HEAD>\n<TITLE>CSE 121/131 Home Page</TITLE>\n...,course
1,"Date: Tue, 26 Nov 1996 19:10:40 GMT\nServer: N...",course
2,"Date: Tue, 26 Nov 1996 19:12:30 GMT\nServer: N...",course
3,"Date: Tuesday, 26-Nov-96 19:09:20 GMT\nServer:...",course
4,"Date: Tuesday, 26-Nov-96 19:07:04 GMT\nServer:...",course
...,...,...
2459,"Date: Tue, 26 Nov 1996 03:44:55 GMT\nServer: N...",student
2460,"Date: Tue, 26 Nov 1996 03:44:12 GMT\nServer: N...",student
2461,"Date: Tue, 26 Nov 1996 03:43:32 GMT\nServer: N...",student
2462,"Date: Tue, 26 Nov 1996 03:43:51 GMT\nServer: N...",student


## Cleaning Data 

In [5]:
# remove HTML mark up
train_data['extracted'] = train_data['text'].apply(lambda s: BeautifulSoup(s).text)

# remove other symbols 
train_data.replace({'-':'',':':'',',':''}, regex=True)

del train_data['text']

#rename col 
train_data.rename(columns = {'extracted':'text',}, inplace = True)

train_data

Unnamed: 0,category,text
0,course,\nCSE 121/131 Home Page\n\n\n CSE 121/131 Home...
1,course,"Date: Tue, 26 Nov 1996 19:10:40 GMT\nServer: N..."
2,course,"Date: Tue, 26 Nov 1996 19:12:30 GMT\nServer: N..."
3,course,"Date: Tuesday, 26-Nov-96 19:09:20 GMT\nServer:..."
4,course,"Date: Tuesday, 26-Nov-96 19:07:04 GMT\nServer:..."
...,...,...
2459,student,"Date: Tue, 26 Nov 1996 03:44:55 GMT\nServer: N..."
2460,student,"Date: Tue, 26 Nov 1996 03:44:12 GMT\nServer: N..."
2461,student,"Date: Tue, 26 Nov 1996 03:43:32 GMT\nServer: N..."
2462,student,"Date: Tue, 26 Nov 1996 03:43:51 GMT\nServer: N..."


## Feature Engineering 

### bag of words, TF-IDF and naive Bayes 

In [6]:
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(train_data['text'])

X_train_counts = CountVectorizer().fit_transform(train_data['text'])

df = pd.DataFrame.sparse.from_spmatrix(X_train_counts)
df.head()

transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = transformer.transform(X_train_counts)
X_train_tfidf.shape

clf = MultinomialNB().fit(X_train_tfidf, train_data['category'])

test_data = read_data('train')

X_test_counts = count_vect.transform(test_data['text'])
X_test_tfidf = transformer.transform(X_test_counts)

accuracy = clf.score(X_test_tfidf, test_data['category'])
print("accurcy:", accuracy)

y_pred = clf.predict(X_test_tfidf)
precision = precision_score(test_data['category'], y_pred, average='macro')
print("precision:", precision)


accurcy: 0.698051948051948
precision: 0.8545969948435784


## Models

### Encoding Data for Cross Validation Testing

In [7]:
le = LabelEncoder()
le.fit(train_data['category'])
pd.Series(le.transform(train_data['category'])).head()

categorical = [c for c in train_data.columns if train_data.dtypes[c] == 'object']

for c in categorical:
    train_data[c] = le.fit_transform(train_data[c])
    
train_data.head()

X = train_data.drop(['category'], axis=1)
y = train_data['text'].astype(int)

X = train_data.drop(['category'], axis=1)

indexes = [train_data.dtypes[c] == 'object' for c in X.columns]

enc = ColumnTransformer([
    ('onehot', OneHotEncoder(sparse=False), indexes)
], remainder='passthrough')

enc.fit(X)
X = enc.transform(X)
X = pd.DataFrame(X)

### Baseline model 

In [8]:
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X, train_data['category'])
DummyClassifier(strategy='most_frequent')
dummy_clf.predict(X)

accuracies_dummy_t = cross_val_score(dummy_clf, X, train_data['category'], cv=10)
print(accuracies_dummy_t)


confidence_baseline_model = st.t.interval(alpha=0.95, df=len(accuracies_dummy_t)-1, loc=np.mean(accuracies_dummy_t), scale=st.sem(accuracies_dummy_t)) 

print(confidence_baseline_model)

[0.44534413 0.44534413 0.44534413 0.44534413 0.44308943 0.44308943
 0.44308943 0.44308943 0.44308943 0.44308943]
(0.4431584050173491, 0.4448242156995464)


#### Ridge Classifier model

In [9]:
liner_clf = RidgeClassifier()
accuracies_liner_clf = cross_val_score(liner_clf, X, train_data['category'], cv=10)
print(accuracies_liner_clf)

confidence_liner_model = st.t.interval(alpha=0.95, df=len(accuracies_liner_clf)-1, loc=np.mean(accuracies_liner_clf), scale=st.sem(accuracies_liner_clf)) 

print(confidence_liner_model)

[0.6437247  0.54251012 0.52226721 0.56275304 0.6504065  0.65853659
 0.48780488 0.51626016 0.30487805 0.59756098]
(0.4736544896484795, 0.6236859533915782)


#### KNeighbors Classifier model

In [10]:
knn = KNeighborsClassifier(n_neighbors = 5)
accuracies_KNeighbors = cross_val_score(knn, X, train_data['category'], cv=10)
print(accuracies_KNeighbors)

confidence_KNeighbors_model = st.t.interval(alpha=0.95, df=len(accuracies_KNeighbors)-1, loc=np.mean(accuracies_KNeighbors), scale=st.sem(accuracies_KNeighbors))
print(confidence_KNeighbors_model)

[0.76923077 0.8097166  0.63562753 0.48178138 0.79674797 0.85365854
 0.82520325 0.66260163 0.6504065  0.80487805]
(0.6451201831847084, 0.8128502588678901)


### Decision Tree Classifier

In [11]:
dt = DecisionTreeClassifier()
accuracies_DecisionTree = cross_val_score(dt, X, train_data['category'], cv=10)
print(accuracies_DecisionTree)

confidence_DecisionTree_model = st.t.interval(alpha=0.95, df=len(accuracies_DecisionTree)-1, loc=np.mean(accuracies_DecisionTree), scale=st.sem(accuracies_DecisionTree)) 

print(confidence_DecisionTree_model)

[0.72469636 0.8097166  0.70445344 0.54251012 0.82113821 0.91869919
 0.85772358 0.67073171 0.67886179 0.81300813]
(0.6752125506540018, 0.833095273314926)


## Evaluation

### T tests Results 

In [12]:

Decision_TreeVSKNeighbors = ttest_rel(accuracies_DecisionTree, accuracies_KNeighbors)

DummyVSKNeighbors =  ttest_rel(accuracies_dummy_t, accuracies_KNeighbors)

Ridge_ClassifierVSKNeighbors =  ttest_rel(accuracies_liner_clf, accuracies_KNeighbors)

Decision_TreeVSRidge_Classifier =  ttest_rel(accuracies_DecisionTree, accuracies_liner_clf)

Decision_TreeVSDummy=  ttest_rel(accuracies_DecisionTree, accuracies_dummy_t)

DummyVSRidge_Classifier =   ttest_rel(accuracies_dummy_t, accuracies_liner_clf)

print('Decision_Tree VS KNeighbors', Decision_TreeVSKNeighbors)
print('Dummy VS KNeighbors', DummyVSKNeighbors,)
print('Ridge_Classifier VS KNeighbors', Ridge_ClassifierVSKNeighbors)
print('Decision_Tree VS Ridge_Classifier',Decision_TreeVSRidge_Classifier) 
print('Decision_Tree VS Dummy', Decision_TreeVSDummy) 
print('Dummy VS Ridge_Classifier', DummyVSRidge_Classifier)

Decision_Tree VS KNeighbors Ttest_relResult(statistic=2.2871901325754562, pvalue=0.04799474716821933)
Dummy VS KNeighbors Ttest_relResult(statistic=-7.65640355946695, pvalue=3.1378711660269563e-05)
Ridge_Classifier VS KNeighbors Ttest_relResult(statistic=-4.595925703284578, pvalue=0.001298029342901032)
Decision_Tree VS Ridge_Classifier Ttest_relResult(statistic=5.3425306592312145, pvalue=0.00046691586930549855)
Decision_Tree VS Dummy Ttest_relResult(statistic=8.844881783230347, pvalue=9.8407506669272e-06)
Dummy VS Ridge_Classifier Ttest_relResult(statistic=-3.1619988433576482, pvalue=0.011513146536767622)


## CSV file

In [18]:
import pandas as pd 
import numpy as np

header = ['Decision Tree', 'KNeighbors', 'Ridge', 'Base line']

a = np.asarray([ [0.6752125506540018,0.833095273314926,0.81300813], [0.6451201831847084,0.8128502588678901,0.80487805], [0.4736544896484795,0.6236859533915782,0.59756098], [0.4431584050173491, 0.4448242156995464, 0.44308943] ])
pd.DataFrame(a).to_csv('featurematrix.csv')