In [13]:
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import pandas as pd

# Load dataset (assuming it's in CSV or TXT format)
df = pd.read_csv('/content/drive/MyDrive/arxiv_papers.csv')  # Update path to your file


In [19]:
print(df.columns)

Index(['abstract', 'author', 'date', 'pdf_url', 'title', 'pdf_text'], dtype='object')


In [21]:
import re
import nltk # Removed extra spaces before this line
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Text cleaning function
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d+', '', text)   # Remove numbers
    text = text.lower()               # Convert to lowercase
    return text

# Apply text cleaning and lemmatization
# Replace 'text_column' with the correct column name (e.g., 'abstract')
df['cleaned_text'] = df['abstract'].apply(clean_text)  # Changed here
df['lemmatized_text'] = df['cleaned_text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split() if word not in stop_words]))

df[['cleaned_text', 'lemmatized_text']].head()  # Check the result

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,cleaned_text,lemmatized_text
0,we first present our view of detection and cor...,first present view detection correction syntac...
1,we first present our view of detection and cor...,first present view detection correction syntac...
2,the choice of modeling units is critical to au...,choice modeling unit critical automatic speech...
3,why should computers interpret language increm...,computer interpret language incrementally rece...
4,stance detection is a classification problem i...,stance detection classification problem natura...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [22]:
import gensim.downloader as api

# Download pre-trained GloVe model
glove_model = api.load('glove-wiki-gigaword-100')

# Convert words into vectors using GloVe
def get_vector(text):
    return [glove_model[word] for word in text.split() if word in glove_model]

df['word_vectors'] = df['lemmatized_text'].apply(get_vector)
df[['lemmatized_text', 'word_vectors']].head()




Unnamed: 0,lemmatized_text,word_vectors
0,first present view detection correction syntac...,"[[-0.020102, 0.037514, 0.35363, 0.16576, 0.094..."
1,first present view detection correction syntac...,"[[-0.020102, 0.037514, 0.35363, 0.16576, 0.094..."
2,choice modeling unit critical automatic speech...,"[[-0.017928, 0.35113, 0.51695, -0.28068, -0.16..."
3,computer interpret language incrementally rece...,"[[-0.16298, 0.30141, 0.57978, 0.066548, 0.4583..."
4,stance detection classification problem natura...,"[[0.4768, 0.027494, 0.13676, 0.097307, -0.7100..."





Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize vectorizer
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['lemmatized_text'])

# Show Bag of Words output
print(X_bow.toarray())


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [25]:
from tensorflow.keras.preprocessing.text import one_hot # Changed the import statement

# Apply one-hot encoding
df['one_hot'] = df['lemmatized_text'].apply(lambda x: one_hot(x, n=10000))  # Specify vocab size
df[['lemmatized_text', 'one_hot']].head()

Unnamed: 0,lemmatized_text,one_hot
0,first present view detection correction syntac...,"[8646, 4208, 7996, 5318, 5066, 3665, 494, 9810..."
1,first present view detection correction syntac...,"[8646, 4208, 7996, 5318, 5066, 3665, 494, 9810..."
2,choice modeling unit critical automatic speech...,"[970, 6414, 7232, 2834, 899, 8303, 4403, 1695,..."
3,computer interpret language incrementally rece...,"[825, 5118, 5908, 8703, 5736, 7228, 3125, 6034..."
4,stance detection classification problem natura...,"[9960, 5318, 8918, 7445, 3564, 5908, 4046, 256..."


In [26]:
nltk.download('averaged_perceptron_tagger')

# Apply POS tagging
df['pos_tags'] = df['lemmatized_text'].apply(lambda x: nltk.pos_tag(x.split()))
df[['lemmatized_text', 'pos_tags']].head()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Unnamed: 0,lemmatized_text,pos_tags
0,first present view detection correction syntac...,"[(first, JJ), (present, JJ), (view, NN), (dete..."
1,first present view detection correction syntac...,"[(first, JJ), (present, JJ), (view, NN), (dete..."
2,choice modeling unit critical automatic speech...,"[(choice, NN), (modeling, VBG), (unit, NN), (c..."
3,computer interpret language incrementally rece...,"[(computer, NN), (interpret, JJ), (language, N..."
4,stance detection classification problem natura...,"[(stance, NN), (detection, NN), (classificatio..."


In [27]:
from textblob import TextBlob

# Sentiment analysis using TextBlob
df['sentiment'] = df['lemmatized_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
df[['lemmatized_text', 'sentiment']].head()


Unnamed: 0,lemmatized_text,sentiment
0,first present view detection correction syntac...,0.151623
1,first present view detection correction syntac...,0.151623
2,choice modeling unit critical automatic speech...,0.132583
3,computer interpret language incrementally rece...,0.072222
4,stance detection classification problem natura...,0.21369


In [28]:
import spacy

# Load the English NLP model from spaCy
nlp = spacy.load('en_core_web_sm')

# Apply Named Entity Recognition
df['ner'] = df['lemmatized_text'].apply(lambda x: [(ent.text, ent.label_) for ent in nlp(x).ents])
df[['lemmatized_text', 'ner']].head()


Unnamed: 0,lemmatized_text,ner
0,first present view detection correction syntac...,"[(first, ORDINAL)]"
1,first present view detection correction syntac...,"[(first, ORDINAL)]"
2,choice modeling unit critical automatic speech...,"[(phoneme modeling, ORG), (english, NORP), (ma..."
3,computer interpret language incrementally rece...,"[(recent year, DATE)]"
4,stance detection classification problem natura...,"[(stance detection classification, ORG), (turk..."


In [32]:
import spacy
import nltk
from textblob import TextBlob
from tensorflow.keras.preprocessing.text import one_hot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ... (your previous code to create and process df) ...

# Discretize the 'sentiment' column into categories
# You can adjust the thresholds as needed
df['sentiment_category'] = pd.cut(df['sentiment'], bins=[-1, -0.1, 0.1, 1], labels=['negative', 'neutral', 'positive'])

# Use the new 'sentiment_category' column as the target variable
actual_label_column = 'sentiment_category'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_bow, df[actual_label_column], test_size=0.2
)

# Train the model and evaluate its performance
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8353658536585366


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Use the 'sentiment_category' column as the target variable
label_column = 'sentiment_category'  # Replace 'label_column' with the actual column name

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_bow, df[label_column], test_size=0.2)

# Train Logistic Regression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predict and check accuracy
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8487224157955865


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
