In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
import spacy
import string
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Lift pandas' display limits so wide frames print without column truncation.
for display_option in ('display.width', 'display.max_columns'):
    pd.set_option(display_option, None)

# Load the raw dataset from the CSV next to the notebook.
file_path = 'Dataset.csv'
train_data = pd.read_csv(file_path)

NameError: name 'pd' is not defined

In [None]:
# The literal string "data missing" is a placeholder; turn it into a real NaN (in place).
train_data.replace("data missing", np.nan, inplace=True)

# Target column, and the feature frame with the target plus the columns
# that are not used as features removed.
y = train_data['category']
columns_to_drop = ['category', 'par_id', 'lexicon_count', 'difficult_words', 'last_editor_gender', 'text_clarity']
X = train_data.drop(columns=columns_to_drop)

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(y_train)

6443                             biographies
5846                             biographies
7104                             programming
251                              biographies
8606                             biographies
                        ...                 
5734    movies about artificial intelligence
5191                              philosophy
5390                             biographies
860                              biographies
7270                             programming
Name: category, Length: 7477, dtype: object


In [None]:
# Fill missing category labels with the most frequent label of the training split.
imputer = SimpleImputer(strategy='most_frequent')

# The imputer is fed single-column 2D arrays, so reshape both label vectors.
# NOTE: this rebinds y_train / y_test from pandas Series to NumPy arrays.
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Fit on the training labels only, then fill both splits with the learned value.
y_train_filled = imputer.fit_transform(y_train)
y_test_filled = imputer.transform(y_test)

# Back to 1D and wrap as Series named 'category' (fresh 0..n-1 index).
y_train_imputed = pd.Series(y_train_filled.flatten(), name='category')
y_test_imputed = pd.Series(y_test_filled.flatten(), name='category')


In [None]:
# Inspect the imputed training labels.
print(y_train_imputed)

0                                biographies
1                                biographies
2                                programming
3                                biographies
4                                biographies
                        ...                 
7472    movies about artificial intelligence
7473                              philosophy
7474                             biographies
7475                             biographies
7476                             programming
Name: category, Length: 7477, dtype: object


In [None]:
# Collapse spelling variants of the same category into one canonical label:
# surrounding whitespace is stripped and only the first letter is upper-cased,
# e.g. 'biographies' -> 'Biographies', 'programming' -> 'Programming'.
#
# The previous hand-written mapping listed 'biography' while the data contains
# 'biographies', so that variant was never merged. It was also applied to the
# training labels only, leaving the test labels with a different label set.
# Normalising both splits with the same rule keeps their label sets aligned.
y_train_imputed = y_train_imputed.str.strip().str.capitalize()
y_test_imputed = y_test_imputed.str.strip().str.capitalize()

In [None]:
# List the distinct training labels to check that spelling variants were merged.
print(np.unique(y_train_imputed))

['Artificial intelligence' 'Biographies'
 'Movies about artificial intelligence' 'Philosophy' 'Programming'
 'biographies']


In [None]:
# One-hot encode the category labels; labels unseen during fit become an
# all-zero row (handle_unknown='ignore').
#
# The encoder is left at its default sparse output and densified with
# .toarray() instead of passing `sparse=False`: that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so this form runs on
# old and new releases alike and yields the same dense float array.
encoder = OneHotEncoder(handle_unknown='ignore')
y_train_encoded = encoder.fit_transform(y_train_imputed.values.reshape(-1, 1)).toarray()
y_test_encoded = encoder.transform(y_test_imputed.values.reshape(-1, 1)).toarray()
print(y_train_encoded)

[[0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]]




In [None]:
# Fill missing feature values with each column's most frequent value.
# The imputer learns those values from the training split only and reuses them
# for the test split; the resulting arrays are wrapped back into DataFrames
# carrying the original column names.
imputer = SimpleImputer(strategy='most_frequent')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)


In [None]:
# One-hot encode 'has_entity', dropping the first level to avoid a redundant column.
#
# The encoder reads the *imputed* frames: these are the frames the encoded
# columns are concatenated with below, so the indicator columns describe the
# same 'has_entity' values that appear next to them, and a missing value in the
# test split cannot surface as an unknown category.
#
# Output is densified with .toarray() rather than `sparse=False`, a keyword
# that was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
encoder = OneHotEncoder(drop='first')
X_train_encoded = encoder.fit_transform(X_train_imputed[['has_entity']]).toarray()
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(['has_entity']))

X_test_encoded = encoder.transform(X_test_imputed[['has_entity']]).toarray()
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(['has_entity']))

# Append the indicator columns to the imputed frames; both sides carry a
# default 0..n-1 index, so the column-wise concat lines up row by row.
X_train_final = pd.concat([X_train_imputed, X_train_encoded_df], axis=1)
X_test_final = pd.concat([X_test_imputed, X_test_encoded_df], axis=1)
print(X_train_final.head(10))


                                           paragraph                      has_entity  \
0  Extension of the Bank to the north-west, the e...   ORG_YES_PRODUCT_NO_PERSON_NO_   
1  Thomson's separation of neon isotopes by their...  ORG_YES_PRODUCT_NO_PERSON_YES_   
2  The Python License was an open-source, GPL-com...   ORG_YES_PRODUCT_NO_PERSON_NO_   
3   Now let's be clear before we go any further t...  ORG_YES_PRODUCT_NO_PERSON_YES_   
4  A General System of Botany, Descriptive and An...   ORG_NO_PRODUCT_NO_PERSON_YES_   
5  The Bahaʼi Faith asserts that evil is non-exis...   ORG_YES_PRODUCT_NO_PERSON_NO_   
6  In August 1993, it was discovered that the pro...  ORG_YES_PRODUCT_NO_PERSON_YES_   
7  Roko's basilisk has gained a significant amoun...  ORG_YES_PRODUCT_NO_PERSON_YES_   
8  1675 – Some Considerations about the Reconcile...  ORG_YES_PRODUCT_NO_PERSON_YES_   
9  For nearly 30 years Ehrenberg examined samples...  ORG_YES_PRODUCT_NO_PERSON_YES_   

   has_entity_ORG_NO_PRODUCT_NO



In [None]:
# Small English spaCy pipeline; the text helpers read it via the global `nlp`.
nlp = spacy.load('en_core_web_sm')
def tokenize_text(text):
    """Run `text` through the spaCy pipeline and return its token strings as a list."""
    return [piece.text for piece in nlp(text)]
# Tokenize the raw paragraph text of each split into a new 'tokenized' column.
for split_frame in (X_train_final, X_test_final):
    split_frame['tokenized'] = split_frame['paragraph'].apply(tokenize_text)

# Preview the first ten rows of each split.
for split_frame in (X_train_final, X_test_final):
    print(split_frame.head(10))

                                           paragraph                      has_entity  \
0  Extension of the Bank to the north-west, the e...   ORG_YES_PRODUCT_NO_PERSON_NO_   
1  Thomson's separation of neon isotopes by their...  ORG_YES_PRODUCT_NO_PERSON_YES_   
2  The Python License was an open-source, GPL-com...   ORG_YES_PRODUCT_NO_PERSON_NO_   
3   Now let's be clear before we go any further t...  ORG_YES_PRODUCT_NO_PERSON_YES_   
4  A General System of Botany, Descriptive and An...   ORG_NO_PRODUCT_NO_PERSON_YES_   
5  The Bahaʼi Faith asserts that evil is non-exis...   ORG_YES_PRODUCT_NO_PERSON_NO_   
6  In August 1993, it was discovered that the pro...  ORG_YES_PRODUCT_NO_PERSON_YES_   
7  Roko's basilisk has gained a significant amoun...  ORG_YES_PRODUCT_NO_PERSON_YES_   
8  1675 – Some Considerations about the Reconcile...  ORG_YES_PRODUCT_NO_PERSON_YES_   
9  For nearly 30 years Ehrenberg examined samples...  ORG_YES_PRODUCT_NO_PERSON_YES_   

   has_entity_ORG_NO_PRODUCT_NO

In [None]:
# Lower-case every token of every row, for both splits.
for split_frame in (X_train_final, X_test_final):
    split_frame['tokenized'] = split_frame['tokenized'].apply(
        lambda words: [word.lower() for word in words]
    )

# Preview the lower-cased tokens of each split.
for split_frame in (X_train_final, X_test_final):
    print(split_frame['tokenized'].head(10))

0    [extension, of, the, bank, to, the, north, -, ...
1    [thomson, 's, separation, of, neon, isotopes, ...
2    [the, python, license, was, an, open, -, sourc...
3    [ , now, let, 's, be, clear, before, we, go, a...
4    [a, general, system, of, botany, ,, descriptiv...
5    [the, bahaʼi, faith, asserts, that, evil, is, ...
6    [in, august, 1993, ,, it, was, discovered, tha...
7    [roko, 's, basilisk, has, gained, a, significa...
8    [1675, –, some, considerations, about, the, re...
9    [for, nearly, 30, years, ehrenberg, examined, ...
Name: tokenized, dtype: object
0    [in, 1896, ,, fitzgerald, and, john, perry, ob...
1    [ , ., for, a, more, detailed, derivation, and...
2    [ , therefore, ,, having, minimal, patient, da...
3    [product, stewardship, includes, waste, dispos...
4    [recursive, allocatable, components, –, as, an...
5    [george, gaylord, simpson,  , was, an, america...
6    [in, some, assembly, languages,  , the, same, ...
7    [that, included, a, £, 40, st

In [None]:
# Loads the small English spaCy pipeline; `remove_stop_words` reads it through
# the module-level name `nlp`.
nlp = spacy.load('en_core_web_sm')
def remove_stop_words(tokens):
    """Return the tokens of `tokens` that are not spaCy stop words.

    The tokens are joined with single spaces and re-tokenized by spaCy, so the
    returned list follows spaCy's tokenization of the joined string.

    Parameters
    ----------
    tokens : list of str
        Token strings of one document.

    Returns
    -------
    list of str
        Text of every re-tokenized token whose `is_stop` flag is false.
    """
    # `is_stop` is a lexical attribute that is set by the tokenizer/vocabulary,
    # so only the tokenizer is needed. `make_doc` skips the tagger, parser and
    # NER that a full `nlp(...)` call would run for no benefit here.
    doc = nlp.make_doc(" ".join(tokens))
    return [token.text for token in doc if not token.is_stop]
# Strip stop words from the token lists of both splits.
for split_frame in (X_train_final, X_test_final):
    split_frame['tokenized'] = split_frame['tokenized'].apply(remove_stop_words)

In [None]:
# Preview the token lists of each split after stop-word removal.
for split_frame in (X_train_final, X_test_final):
    print(split_frame['tokenized'].head(10))

0    [extension, bank, north, -, west, ,, exterior,...
1    [thomson, separation, neon, isotopes, mass, ex...
2    [python, license, open, -, source, ,, gpl, -, ...
3    [  , let, clear, tom, kilburn, knew, thing, co...
4    [general, system, botany, ,, descriptive, anal...
5    [bahaʼi, faith, asserts, evil, non, -, existen...
6    [august, 1993, ,, discovered, proof, contained...
7    [roko, basilisk, gained, significant, notoriet...
8    [1675, –, considerations, reconcileableness, r...
9    [nearly, 30, years, ehrenberg, examined, sampl...
Name: tokenized, dtype: object
0    [1896, ,, fitzgerald, john, perry, obtained, c...
1    [  , ., detailed, derivation, interpretations,...
2    [  , ,, having, minimal, patient, data, minori...
3    [product, stewardship, includes, waste, dispos...
4    [recursive, allocatable, components, –, altern...
5    [george, gaylord, simpson,   , american, paleo...
6    [assembly, languages,   , mnemonic, ,, mov, ,,...
7    [included, £, 40, stipend, .,

In [None]:
def remove_punctuation(tokens):
    """Return `tokens` without the entries that consist only of ASCII punctuation.

    A token is dropped when it is empty or when every one of its characters is
    in `string.punctuation` (so ",", "...", "--" and "()" are all removed).
    Tokens that merely contain punctuation ("gpl-compatible", "3.5", "'s") are
    kept unchanged, as are whitespace tokens and non-ASCII marks such as "–".

    The tokens are filtered as given: they are not joined and re-parsed, so the
    token boundaries chosen by the earlier tokenization step are preserved and
    no spaCy pipeline run is needed.

    Parameters
    ----------
    tokens : list of str
        Token strings of one document.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    punctuation_chars = set(string.punctuation)
    # Per-character test instead of `token in string.punctuation`: the latter
    # is a substring check and therefore misses multi-character punctuation
    # tokens such as "..." that are not contiguous in `string.punctuation`.
    return [
        token
        for token in tokens
        if token and not all(char in punctuation_chars for char in token)
    ]
# Strip punctuation tokens from the token lists of both splits.
for split_frame in (X_train_final, X_test_final):
    split_frame['tokenized'] = split_frame['tokenized'].apply(remove_punctuation)

In [None]:
# Preview the token lists of each split after punctuation removal.
for split_frame in (X_train_final, X_test_final):
    print(split_frame['tokenized'].head(10))

In [None]:
def lemmatize_text(tokens):
    """Join `tokens` with spaces, parse the result with spaCy and return each token's lemma."""
    sentence = " ".join(tokens)  # spaCy works on a string, not a token list
    return [word.lemma_ for word in nlp(sentence)]
# Replace every token by its lemma, for both splits.
for split_frame in (X_train_final, X_test_final):
    split_frame['tokenized'] = split_frame['tokenized'].apply(lemmatize_text)

# Preview the lemmatized token lists of each split.
for split_frame in (X_train_final, X_test_final):
    print(split_frame['tokenized'].head(10))

0    [extension, bank, north, west, exterior, wall,...
1    [thomson, separation, neon, isotope, mass, exa...
2    [python, license, open, source, gpl, compatibl...
3    [    , let, clear, tom, kilburn, know, thing, ...
4    [general, system, botany, descriptive, analyti...
5    [bahaʼi, faith, assert, evil, non, existent, c...
6    [august, 1993, discover, proof, contain, flaw,...
7    [roko, basilisk, gain, significant, notoriety,...
8    [1675, –, consideration, reconcileableness, re...
9    [nearly, 30, year, ehrenberg, examine, sample,...
Name: tokenized, dtype: object
0    [1896, fitzgerald, john, perry, obtain, civil,...
1    [    , detailed, derivation, interpretation, e...
2    [    , have, minimal, patient, datum, minority...
3    [product, stewardship, include, waste, disposa...
4    [recursive, allocatable, component, –, alterna...
5    [george, gaylord, simpson,     , american, pal...
6    [assembly, language,     , mnemonic, mov, fami...
7    [include, £, 40, stipend, awa

In [None]:
# Preview the first ten rows of the training frame.
print(X_train_final.head(10))

In [None]:
# X_train_final['tokenized'] = X_train_tfidf.toarray()
# X_test_final['tokenized'] = X_test_tfidf.toarray()
# X_train_final['bigram_tokenized'] = X_train_bigram.toarray()
# X_test_final['bigram_tokenized'] = X_test_bigram.toarray()
# print(X_train_final.head(10))

In [None]:
# Count the missing values in each column of the training frame.
X_train_final.isnull().sum()

In [None]:

# Bag-of-n-grams features: counts of unigrams and bigrams (ngram_range=(1, 2)).
unigram_bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))

# The vectorizer takes one string per document, so glue each token list back
# together with single spaces.
train_documents = X_train_final['tokenized'].map(' '.join)
test_documents = X_test_final['tokenized'].map(' '.join)

# The vocabulary is learned from the training documents only and reused for the test documents.
X_train_unigram_bigram = unigram_bigram_vectorizer.fit_transform(train_documents)
X_test_unigram_bigram = unigram_bigram_vectorizer.transform(test_documents)


In [None]:
# Show the dtypes of the training labels and of the n-gram count matrix.
for model_input in (y_train_imputed, X_train_unigram_bigram):
    print(model_input.dtype)

In [None]:
# Two classifiers trained on the same unigram/bigram count features.
naive_bayes_classifier = MultinomialNB(alpha=200.0)
svm_classifier = SVC(C=200.0)

# Each classifier is wrapped in a one-step Pipeline whose step is named 'classifier'.
naive_bayes_pipeline = Pipeline(steps=[('classifier', naive_bayes_classifier)])
svm_pipeline = Pipeline(steps=[('classifier', svm_classifier)])

# Pipelines and their display names, kept in matching order.
pipelines = [naive_bayes_pipeline, svm_pipeline]
pipeline_names = ['Naive Bayes', 'SVM']

# Train and evaluate each model in turn.
for name, pipeline in zip(pipeline_names, pipelines):
    pipeline.fit(X_train_unigram_bigram, y_train_imputed)

    # Predict on both splits first; the report below prints in a fixed order.
    y_pred_train = pipeline.predict(X_train_unigram_bigram)
    y_pred_test = pipeline.predict(X_test_unigram_bigram)

    # Confusion matrices: true labels first, predictions second.
    cm_train = confusion_matrix(y_train_imputed, y_pred_train)
    cm_test = confusion_matrix(y_test_imputed, y_pred_test)

    # Accuracy = share of predictions that equal the true label.
    accuracy_train = np.mean(y_pred_train == y_train_imputed)
    accuracy_test = np.mean(y_pred_test == y_test_imputed)

    print(f'{name} Training Confusion Matrix:')
    print(cm_train)
    print(f'{name} Testing Confusion Matrix:')
    print(cm_test)
    print(f'{name} Training Accuracy: {accuracy_train}')
    print(f'{name} Testing Accuracy: {accuracy_test}')


Naive Bayes Training Confusion Matrix:
[[ 673    0    0  328   43  197]
 [   0    0    0    6    0    5]
 [   2    0    0   19    2  106]
 [   8    0    0 1935    9  133]
 [   4    0    0  120 1381   77]
 [  16    0    0  170    8 2235]]
Naive Bayes Testing Confusion Matrix:
[[  3   0   0   0   0   1   0   0   0]
 [  0   0   0   0   0   2   0   0   0]
 [  0   0   3   0   0   0   0   0   0]
 [  0   0   1   2   0   0   0   0   0]
 [168   0  91  15   0  57   0   0   0]
 [  2   0  56   1   0 515   0   0   0]
 [  0   0   4   0   0  34   0   0   0]
 [  2   0 464   4   0  40   0   0   0]
 [  4   0  24 351   0  26   0   0   0]]
Naive Bayes Training Accuracy: 0.8324194195532968
Naive Bayes Testing Accuracy: 0.2796791443850267
SVM Training Confusion Matrix:
[[1231    0    0    0    0   10]
 [   0   11    0    0    0    0]
 [   0    0  128    0    0    1]
 [   0    0    0 2074    0   11]
 [   0    0    0    0 1576    6]
 [   9    0    1   13    7 2399]]
SVM Testing Confusion Matrix:
[[  4   0   0