### 1. Download the datasets from the course website, unzip (7z program) and load it with pandas.
### (Hint: df = pd.read_json("..json", orient='records', lines=True)
### Use the following site from arxiv (https://arxiv.org/category_taxonomy) to taxonomize the documents into 5 categories: {0:'Computer Science', 1:'Mathematics', 2:'Statistics', 3:'Economics', 4:'EESS'}. This will constitute the ground truth. To create labels, use the first category entry.
### (Hint: Use the regular expression re.search(r'^([\w\-]+)', cat.split(' ')[0]).group(1) on the categories column. Check: [110347, 130726, 13920, 924, 7252] and [111091, 130287, 13672, 871, 7249] in training and testing sizes, respectively)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re


# Load the training and testing datasets
df_train = pd.read_json("arxiv_training.json", orient='records', lines=True)
df_test = pd.read_json("arxiv_testing.json", orient='records', lines=True)


In [2]:
def sanitize(cat):
    sant = re.search(r'^([\w\-]+)', cat.split(' ')[0]).group(1)
    return sant if sant in {'cs', 'stat', 'math', 'econ', 'eess'} else 'NA'

df_train['categories'] = df_train['categories'].apply(sanitize)
df_test['categories'] = df_test['categories'].apply(sanitize)

df_train['update_date'] = pd.to_datetime(df_train['update_date'])
df_test['update_date'] = pd.to_datetime(df_test['update_date'])

df_train = df_train[(df_train['categories'] != 'NA') & (df_train['update_date'].dt.year >= 2010)]
df_test = df_test[(df_test['categories'] != 'NA') & (df_test['update_date'].dt.year >= 2010)]

In [3]:
from collections import Counter
Counter(df_train['categories']),Counter(df_test['categories'])

(Counter({'math': 149465,
          'cs': 115106,
          'stat': 14305,
          'eess': 7285,
          'econ': 903}),
 Counter({'math': 148482,
          'cs': 116397,
          'stat': 14251,
          'eess': 7217,
          'econ': 892}))

### 2. [30 pts] Classify the documents and report testing accuracy (confusion matrix?).
### (Hint: Use TfidfVectorizer with a Pipeline. If you use a random forest make sure the max_depth parameter is reasonable, i.e. <30, to make sure it finishes faster. Any classifier you use has to know how to deal with sparse matrices.)

In [4]:
# # Split the data into training and testing sets
X_train, y_train = df_train['abstract'], df_train['categories']
X_test, y_test = df_test['abstract'], df_test['categories']

In [15]:
# Create a pipeline with CountVectorizer, TfidfTransformer, and a MultinomialNB classifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler  # Example data preprocessing step
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
N_FEATURES=10
pipeline = Pipeline([('scaler', TfidfVectorizer()),  # Standardize the data
    ('classifier', LogisticRegression(max_iter=100)),
                   ])
pipeline.fit(X_train, y_train)
# Predict the categories on the test data
y_pred = pipeline.predict(X_test)
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Testing Accuracy:", accuracy)

# Generate and display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Testing Accuracy: 0.8971588119997632
Confusion Matrix:
[[106964     21   1141   6762   1509]
 [   363    138      0    181    210]
 [  5366      3   1288    476     84]
 [  5490     17    140 141403   1432]
 [  4426     32     54   1833   7906]]


In [6]:
from sklearn.metrics import accuracy_score, classification_report
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(classification_rep)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.7928902412276885
              precision    recall  f1-score   support

          cs       0.78      0.76      0.77    116397
        econ       0.00      0.00      0.00       892
        eess       0.19      0.00      0.00      7217
        math       0.80      0.94      0.86    148482
        stat       0.63      0.02      0.03     14251

    accuracy                           0.79    287239
   macro avg       0.48      0.34      0.33    287239
weighted avg       0.77      0.79      0.76    287239



  _warn_prf(average, modifier, msg_start, len(result))


### State the class with the least performance. Why?

The class with the least performance is econ and eess. The reason for this is largely due to two things. The first thing is that eess and econ both contain the smallest number of entries, which can lead to a less accurate definer. The next issue is that the classifier is overwhelmingly skewed towards falsely identifying eess and econ as a computer science paper. This is likely due to the fact that a large number of papers hee =

### 3. [30 pts] Apply POS processing and pick only nouns as tokens (i.e., <NN>+, <NNP>+, etc.).
### Repeat and report any classification improvement in your pipeline in (2.).
### Can you suggest other tags to improve the pipeline?
### (Note that POS will take a long time (up to 1 hour) in this large dataset and suggest saving the tokens in another DataFrame. In addition, once tokenized and stored one can use TfidfVectorizer(tokenizer=lambda x:x, lowercase=False) to work on the tokenized list directly. Make sure you test your tagger on small datasets first. Suggest PerceptronTagger())

In [83]:
abs_df_train=df_train['abstract']
cat_df_train=df_train['categories']




test=abs_df_train.iloc[0:20]
test_cat=cat_df_train.iloc[0:20]

In [75]:
def extract_nouns(text):
    # Tokenize the text using a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    
    # Perform POS tagging on the tokens using the PerceptronTagger
    pos_tags = PerceptronTagger().tag(tokens)
    
    # Extract nouns (NN, NNS, NNP, NNPS)
    nouns = [token for token, pos in pos_tags if pos in ['NN', 'NNS', 'NNP', 'NNPS']]
    
    return nouns

In [87]:
from nltk.tokenize import RegexpTokenizer
from nltk.tag import PerceptronTagger
df_nouns = pd.DataFrame()
df_nouns['nouns'] = test

# Save this new DataFrame for future use, as POS tagging can be time-consuming
df_nouns.to_csv('nouns.csv', index=False)

In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the saved nouns DataFrame
df_nouns = pd.read_csv('nouns.csv')

# Create a TfidfVectorizer with custom tokenizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)

# Fit and transform the nouns to obtain the TF-IDF features

act_test=df_train['categories'].iloc[0:20].apply(extract_nouns)


tfidf_matrix = tfidf_vectorizer.fit_transform(df_nouns['nouns'])
from sklearn.model_selection import train_test_split

# Assuming you have a target variable 'labels' in your original DataFrame

from sklearn.linear_model import LogisticRegression

# Initialize and train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)
from sklearn.metrics import accuracy_score

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.25


