# Model

# Imports

In [3]:
# Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the training split from the project's data folder and preview it.
# The preview shows an id, the ABSTRACT text and 0/1 topic indicator columns.
df_train = pd.read_csv('data/Train_topic.csv')
df_train.head(3)

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics,Analysis of PDEs,Applications,Artificial Intelligence,Astrophysics of Galaxies,...,Methodology,Number Theory,Optimization and Control,Representation Theory,Robotics,Social and Information Networks,Statistics Theory,Strongly Correlated Electrons,Superconductivity,Systems and Control
0,1824,a ever-growing datasets inside observational a...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3094,we propose the framework considering optimal $...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8463,nanostructures with open shell transition meta...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Load the test split and preview it.
# Unlike the training preview, only the four top-level topic columns show up here.
df_test = pd.read_csv('data/Test_topic.csv')
df_test.head(3)

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics
0,9409,fundamental frequency (f0) approximation from ...,0,0,0,1
1,17934,"this large-scale study, consisting of 24.5 mil...",1,0,0,1
2,16071,we present a stability analysis of the plane c...,0,0,1,0


# Models

## Model LDA

In [None]:
# start model LDA

## Model Classification

### Imports

In [None]:
import nltk
import pandas as pd

from nltk.corpus import movie_reviews, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

# NLTK resources used by the cells below
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('movie_reviews')

### Data preparation

We will transform the tagged `movie_reviews` corpus from nltk into a pandas DataFrame with the script below:

In [None]:
# Build one (label, raw text) record per corpus file.
# Adapted from an external snippet; its link was not preserved in this notebook.
# Each file id has the form '<label>/<file name>', so the label is the part
# before the slash.
reviews = [
    (tag, movie_reviews.raw(fileid))
    for fileid in movie_reviews.fileids()
    for tag, _filename in [fileid.split('/')]
]
sample = pd.DataFrame(reviews, columns=['target', 'document'])
print(f'Dimensions: {sample.shape}')
sample.head()

You will see that the dataframe has 2 columns, a column for the targets (the sentiment polarity) and a column for the reviews (i.e. documents), covering 2000 reviews. Each review is tagged as either a positive or a negative review. Let’s check the counts of the target classes:

In [None]:
# Number of reviews per sentiment label
sample['target'].value_counts()

Each class (i.e. ‘pos’, ‘neg’) has 1000 records, so the data is perfectly balanced. Let’s ensure that the classes are binary coded:

In [None]:
# Binary-code the target: 'pos' -> 1, anything else -> 0.
# The guard keeps this cell idempotent. Once the column is numeric, running the
# cell again would compare 0/1 against 'pos' and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(sample['target']):
    sample['target'] = np.where(sample['target'] == 'pos', 1, 0)
sample['target'].value_counts()

### Partition data

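Split the sample data into 3 groups:
- train
- validation
- test

train is used to fit the model,
validation is used to evaluate the fitness of interim models,
and test is used to assess the fitness of the final model.

Note that the code below only creates an explicit train/test split; the validation role is played by 5-fold cross-validation on the training partition.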

In [None]:
# Hold out 30% of the reviews as the test partition (fixed seed for repeatability)

documents, labels = sample['document'], sample['target']
X_train, X_test, y_train, y_test = train_test_split(
    documents,
    labels,
    test_size=0.3,
    random_state=123,
)
print(f'Train dimensions: {X_train.shape, y_train.shape}')
print(f'Test dimensions: {X_test.shape, y_test.shape}')

# Class balance inside each partition

for partition_labels in (y_train, y_test):
    print(partition_labels.value_counts())

### Preprocess documents

Preprocess the training documents, i.e. transform the unstructured text into a matrix of numbers.

Let’s preprocess the text using an approach called bag-of-words, where each text is represented by its words regardless of the order in which they appear or the embedded grammar, with the following steps:

- Tokenise
- Normalise
- Remove stop words
- Count vectorise
- Transform to tf-idf representation

In [None]:
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Turn one raw document into a list of normalised keywords.

    Steps: tokenise on runs of word characters (punctuation is dropped),
    lowercase each token, lemmatise it as a verb, then discard English
    stop words.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        The lemmas that are not stop words, in their original order.
    """
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)

    # Lowercase and lemmatise
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Remove stop words. The stop-word collection is looked up once here rather
    # than once per token, and set membership replaces a list scan.
    stop_words = _english_stop_words()
    keywords = [lemma for lemma in lemmas if lemma not in stop_words]
    return keywords
# Create an instance of TfidfVectorizer; preprocess_text is supplied as the
# analyzer, so it decides which tokens each document contributes
vectoriser = TfidfVectorizer(analyzer=preprocess_text)
# Fit to the training documents and transform them to a feature matrix
X_train_tfidf = vectoriser.fit_transform(X_train)
# (number of documents, number of features)
X_train_tfidf.shape

### Baseline model

In [None]:
# Baseline: SGDClassifier with default hyperparameters, scored with 5-fold cross-validation

sgd_clf = SGDClassifier(random_state=123)
sgf_clf_scores = cross_val_score(sgd_clf, X_train_tfidf, y_train, cv=5)
print(sgf_clf_scores)

# Summarise the folds as mean +/- two standard deviations
mean_score = sgf_clf_scores.mean()
score_spread = sgf_clf_scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, score_spread))

In [None]:
# Cross-validated accuracy of the baseline classifier, one value per fold (cv=5).
# NOTE(review): this appears to repeat the scoring of the previous cell with the
# metric named explicitly — confirm whether both cells are needed.

cross_val_score(sgd_clf, X_train_tfidf, y_train, cv=5, scoring='accuracy')

In [None]:
# Confusion matrix of the baseline on the training partition, built from
# cross-validated predictions (cv=5)

sgf_clf_pred = cross_val_predict(sgd_clf, X_train_tfidf, y_train, cv=5)
print(confusion_matrix(y_train, sgf_clf_pred))

### Improving performance

The purpose of this section is to find the best machine learning algorithm as well as its hyperparameters

In [None]:
# Do a grid search in order to find the best values for the hyperparameters.
# Every combination below is evaluated with 5-fold cross-validation.
#
# Two values differ from older tutorials on purpose:
# - the logistic loss is spelled 'log_loss'; the former name 'log' was
#   deprecated in scikit-learn 1.1 and removed in 1.3
# - "no penalty" is passed as None; recent releases reject the string 'none'

grid = {'fit_intercept': [True, False],
        'early_stopping': [True, False],
        'loss': ['hinge', 'log_loss', 'squared_hinge'],
        'penalty': ['l2', 'l1', None]}
search = GridSearchCV(estimator=sgd_clf, param_grid=grid, cv=5)
search.fit(X_train_tfidf, y_train)
search.best_params_

Let’s train and validate the model using these values for the selected hyperparameters

### Final model

In [None]:
# Pipeline: chain the TF-IDF vectoriser with the best estimator found by the
# grid search, so that raw documents can be passed in directly.
# fit() is called on the raw training documents, not on X_train_tfidf.

pipe = Pipeline([('vectoriser', vectoriser),
                 ('classifier', search.best_estimator_)])
pipe.fit(X_train, y_train)

In [None]:
# Final evaluation on the held-out test documents: accuracy and confusion matrix

y_test_pred = pipe.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy: %0.2f" % (test_accuracy))
print(confusion_matrix(y_test, y_test_pred))

### Test2

In [None]:
# imports

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
# Load the training split again under the name `df` and preview it.
# NOTE(review): this reads the same file as `df_train` earlier in the notebook —
# confirm whether a second copy is needed.

df = pd.read_csv('data/Train_topic.csv')
df.head(3)

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics,Analysis of PDEs,Applications,Artificial Intelligence,Astrophysics of Galaxies,...,Methodology,Number Theory,Optimization and Control,Representation Theory,Robotics,Social and Information Networks,Statistics Theory,Strongly Correlated Electrons,Superconductivity,Systems and Control
0,1824,a ever-growing datasets inside observational a...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3094,we propose the framework considering optimal $...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8463,nanostructures with open shell transition meta...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# (rows, columns) of the training frame
df.shape

(14004, 31)

In [None]:
# Set the desired sample size (number of rows to keep)
sample_size = 7000

# Create a random sample; the fixed random_state makes the draw repeatable
random_sample = df.sample(n=sample_size, random_state=42)

# Save the random sample as a new Parquet file in the working directory.
# NOTE(review): the commented-out pip cells below suggest pyarrow or
# fastparquet must be installed for this to work — confirm.
random_sample.to_parquet('random_sample_7000.parquet')

In [None]:
#!pip install pyarrow

In [None]:
#!pip install fastparquet

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Features are the abstract texts; targets are the topic indicator columns
# (every column except the identifier and the text itself).
# These two were previously swapped, which asked the model to predict the
# abstract string from the topic flags.
X = df_train['ABSTRACT']
y = df_train.drop(['id', 'ABSTRACT'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TODO: df_test is not used for evaluation here because its topic columns do
# not match the training ones, so a hold-out from df_train is used instead.

# TF-IDF turns each abstract into a numeric feature vector; the tree is then
# fitted on all topic columns at once. Seeded so that re-runs give the same tree.
classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('tree', DecisionTreeClassifier(random_state=42)),
])
classifier.fit(X_train, y_train)

# Make predictions on the held-out abstracts
y_pred = classifier.predict(X_test)

# Evaluate the model. With several target columns this is exact-match accuracy:
# a row only counts as correct when every topic flag is predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Features are the abstract texts; targets are the topic indicator columns
# (every column except the identifier and the text itself).
# These two were previously swapped, which asked the model to predict the
# abstract string from the topic flags.
X = df_train['ABSTRACT']
y = df_train.drop(['id', 'ABSTRACT'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF turns each abstract into a numeric feature vector; the forest is then
# fitted on all topic columns at once. Seeded so that re-runs give the same model.
classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('forest', RandomForestClassifier(random_state=42)),
])
classifier.fit(X_train, y_train)

# Make predictions on the held-out abstracts
y_pred = classifier.predict(X_test)

# Evaluate the model. With several target columns this is exact-match accuracy:
# a row only counts as correct when every topic flag is predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Models save