In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Load the raw tweet export; the file is read with an explicit ISO-8859-1 encoding.
df = pd.read_csv("tweets.csv", encoding="ISO-8859-1")
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Call info() so the dtype / non-null summary is printed; the bare attribute
# `df.info` only displays the bound-method repr (which dumps the frame).
df.info()


In [None]:
# Drop tweets without text. Once those rows are gone no NaN can remain in
# 'text', so no fill step is needed. The index is reset so it stays contiguous
# (0..n-1) and later index-aligned column assignments line up row for row.
df = df.dropna(subset=['text']).reset_index(drop=True)

In [None]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

In [None]:
# Show the column labels of the raw frame.
df.columns

In [None]:
# Tweet metadata columns removed from the working frame.
metadata_columns = [
    'Unnamed: 0', 'favorited', 'favoriteCount', 'replyToSN',
    'created', 'truncated', 'replyToSID', 'id', 'replyToUID',
    'statusSource', 'screenName', 'retweetCount', 'isRetweet', 'retweeted',
    'longitude', 'latitude',
]
text_df = df.drop(columns=metadata_columns)
text_df.head()

In [None]:
# Print the first five raw tweets, each followed by a blank line.
for position in range(5):
    print(text_df['text'].iloc[position], '\n')

In [None]:
import re

def preprocess_tweet(sen):
    '''Clean one raw tweet and return it as a space-joined string of tokens.

    Steps, in order: lowercase; remove URLs; remove a leading retweet marker;
    remove @mentions; strip punctuation (which also drops '#', keeping the
    hashtag word); replace any remaining non-ASCII-alphanumeric character with
    a space; drop isolated single letters; collapse whitespace; tokenize with
    `word_tokenize`; remove tokens found in the module-level `stop_words`.

    Parameters
    ----------
    sen : str or NaN/None
        Raw tweet text. Missing values yield an empty string.

    Returns
    -------
    str
        Cleaned tweet with stop words removed.
    '''
    # Missing values become an empty document
    if pd.isna(sen):
        return ""

    # Convert to lowercase (str() guards against non-string cells)
    sentence = str(sen).lower()

    # URLs must go while "://" and "." are still present; once punctuation is
    # stripped they would degrade into junk words such as "httptcoabc".
    sentence = re.sub(r"\w+://\S+|www\.\S+", '', sentence)

    # Remove a leading retweet marker ("rt @user: ...")
    sentence = re.sub(r'^rt\s+', '', sentence)

    # Remove @mentions while the '@' is still there; \w also covers underscores
    sentence = re.sub(r'@\w+', '', sentence)

    # Strip punctuation; this removes '#' but keeps the hashtag word itself
    sentence = re.sub(r'[^\w\s]', '', sentence)

    # Replace anything that is not an ASCII letter, digit, space or tab
    sentence = re.sub(r"[^0-9A-Za-z \t]", " ", sentence)

    # Remove single characters surrounded by whitespace
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Collapse runs of whitespace and trim the ends
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    sentence_tokens = word_tokenize(sentence)
    filtered_text = [w for w in sentence_tokens if w not in stop_words]
    return " ".join(filtered_text)
   

In [None]:
# Run every raw tweet through the cleaning function, preserving row order.
cleaned_tweets = [preprocess_tweet(raw_tweet) for raw_tweet in text_df['text']]

In [None]:
stemmer = PorterStemmer()
def stemming(data):
    '''Apply Porter stemming and return the stemmed result.

    Parameters
    ----------
    data : str or iterable of str
        A whitespace-separated string of words, or an iterable of words.

    Returns
    -------
    str or list of str
        For a string: the stemmed words joined by single spaces.
        For any other iterable: a list with one stem per input word.
    '''
    if isinstance(data, str):
        # Split first: iterating a string directly would stem single characters.
        return " ".join(stemmer.stem(word) for word in data.split())
    return [stemmer.stem(word) for word in data]

In [None]:
# Assign the list positionally. Wrapping it in a DataFrame would align on index
# labels (0..n-1), which misplaces rows or leaves NaN whenever text_df's index
# has gaps from dropped rows. A length mismatch raises a ValueError here.
text_df['cleaned'] = cleaned_tweets
text_df.head(5)

In [None]:
# Reassign instead of calling fillna(inplace=True) on a column selection: the
# in-place form works on an intermediate object and is not guaranteed to
# update text_df (pandas warns about this chained-assignment pattern).
text_df['cleaned'] = text_df['cleaned'].fillna('')

In [None]:
# Stem each cleaned tweet.
text_df['cleaned'] = text_df['cleaned'].apply(stemming)

In [None]:
import nltk
# Download the 'vader_lexicon' NLTK resource.
nltk.download('vader_lexicon')

In [None]:
import pandas as pd

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob


def _label_from_compound(comp):
    """Map a VADER compound score to 'negative', 'positive' or 'neutral'."""
    if comp <= -0.05:
        return "negative"
    if comp >= 0.05:
        return "positive"
    return "neutral"


# Make sure every entry in 'cleaned' is a string before scoring
text_df['cleaned'] = text_df['cleaned'].astype(str)

# TextBlob polarity / subjectivity per tweet
text_df[['polarity', 'subjectivity']] = text_df['cleaned'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

# Score every tweet once with a single VADER analyzer. The scores are collected
# into one frame that shares text_df's index, so the column assignments below
# line up row for row. (Series.iteritems was removed in pandas 2.0.)
analyzer = SentimentIntensityAnalyzer()
vader_columns = ['neg', 'neu', 'pos', 'compound']
vader_scores = pd.DataFrame(
    [analyzer.polarity_scores(tweet) for tweet in text_df['cleaned']],
    columns=vader_columns,
    index=text_df.index,
)

text_df['vader_score'] = vader_scores['compound']
text_df['sentiment'] = vader_scores['compound'].map(_label_from_compound)
for column in vader_columns:
    text_df[column] = vader_scores[column]

text_df.head(5)

In [None]:
# Bar chart of how many tweets fall into each sentiment class.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x='sentiment', data=text_df, ax=ax)

In [None]:
fig = plt.figure(figsize=(7,7))
# Colours are keyed by label so a class keeps its colour whatever its rank in
# value_counts() (which orders by frequency, not by label).
sentiment_colors = {'positive': "yellowgreen", 'neutral': "gold", 'negative': "red"}
wp = {'linewidth':2, 'edgecolor':"black"}
tags = text_df['sentiment'].value_counts()
colors = [sentiment_colors[label] for label in tags.index]
# One offset per wedge actually present, so the plot also works when a class is absent.
explode = [0.1] * len(tags)
tags.plot(kind='pie', autopct='%1.1f%%', shadow=True, colors = colors,
         startangle=90, wedgeprops = wp, explode = explode, label='')
plt.title('Distribution of sentiments')

In [None]:
# Neutral tweets, highest TextBlob polarity first.
is_neutral = text_df['sentiment'] == 'neutral'
neu_tweets = text_df[is_neutral].sort_values(['polarity'], ascending=False)
neu_tweets.head()

In [None]:
# Concatenate all neutral tweets into one string and render its word cloud.
text = ' '.join(neu_tweets['cleaned'])
plt.figure(figsize=(20, 15), facecolor='None')
cloud_builder = WordCloud(max_words=500, width=1600, height=800)
wordcloud = cloud_builder.generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in neutral tweets', fontsize=19)
plt.show()

In [None]:
# Positive tweets, highest TextBlob polarity first.
is_positive = text_df['sentiment'] == 'positive'
pos_tweets = text_df[is_positive].sort_values(['polarity'], ascending=False)
pos_tweets.head()

In [None]:
# Drop rows without cleaned text, then render the positive-tweet word cloud.
pos_tweets = pos_tweets.dropna(subset=['cleaned'])
text = ' '.join(pos_tweets['cleaned'])
plt.figure(figsize=(20, 15), facecolor='None')
cloud_builder = WordCloud(max_words=500, width=1600, height=800)
wordcloud = cloud_builder.generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in positive tweets', fontsize=19)
plt.show()

In [None]:
# Negative tweets, highest TextBlob polarity first.
is_negative = text_df['sentiment'] == 'negative'
neg_tweets = text_df[is_negative].sort_values(['polarity'], ascending=False)
neg_tweets.head()

In [None]:
# Concatenate all negative tweets into one string and render its word cloud.
text = ' '.join(neg_tweets['cleaned'])
plt.figure(figsize=(20, 15), facecolor='None')
cloud_builder = WordCloud(max_words=500, width=1600, height=800)
wordcloud = cloud_builder.generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in negative tweets', fontsize=19)
plt.show()

In [None]:
# Learn a unigram + bigram vocabulary from every cleaned tweet.
# NOTE(review): the vocabulary is fitted on all of text_df, i.e. before the
# train/test split performed later, so test-set tokens influence the features.
vect = CountVectorizer(ngram_range=(1,2)).fit(text_df['cleaned'])

In [None]:
# Inspect the learned vocabulary: its size and the first twenty entries.
feature_names = vect.get_feature_names_out()
print(f"Number of features: {len(feature_names)}\n")
print(f"First 20 features:\n {feature_names[:20]}")

In [None]:
# Targets are the sentiment labels; features are the vectorized cleaned tweets.
Y = text_df['sentiment']
X = vect.transform(text_df['cleaned'])

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Report the shape of each split.
for split_name, split_data in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"Size of {split_name}:", split_data.shape)

In [None]:
import warnings

# Suppress every warning from this point on.
# NOTE(review): this blanket filter also hides any warnings raised by the model
# fits in the following cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Fit a default logistic regression and score it on the held-out split.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
logreg_pred = logreg.predict(x_test)
# Ground truth first, predictions second; accuracy is the same either way.
logreg_acc = accuracy_score(y_test, logreg_pred)
print(f"Test accuracy: {logreg_acc*100:.2f}%")

In [None]:
# Confusion matrix, two blank lines, then the per-class report.
print(confusion_matrix(y_test, logreg_pred), end="\n\n\n")
print(classification_report(y_test, logreg_pred))

In [None]:
# Plot the confusion matrix with the classifier's own label order.
style.use('classic')
class_labels = logreg.classes_
cm = confusion_matrix(y_test, logreg_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search over the regularisation strength C for logistic regression.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid)
grid.fit(x_train, y_train)

In [None]:
# Show the winning hyper-parameters from the search.
print(f"Best parameters: {grid.best_params_}")

In [None]:
# Predict the test split with the fitted search object.
y_pred = grid.predict(x_test)

In [None]:
# Accuracy of the tuned model (ground truth first, predictions second).
logreg_acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {logreg_acc*100:.2f}%")

In [None]:
# Confusion matrix, two blank lines, then the per-class report.
print(confusion_matrix(y_test, y_pred), end="\n\n\n")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Fit a linear support-vector classifier with default settings.
SVCmodel = LinearSVC()
SVCmodel.fit(x_train, y_train)

In [None]:
# Score the linear SVC on the held-out split (ground truth first).
svc_pred = SVCmodel.predict(x_test)
svc_acc = accuracy_score(y_test, svc_pred)
print(f"test accuracy: {svc_acc*100:.2f}%")

In [None]:
# Confusion matrix, two blank lines, then the per-class report.
print(confusion_matrix(y_test, svc_pred), end="\n\n\n")
print(classification_report(y_test, svc_pred))

In [None]:
# Hyper-parameter grid for the linear SVC. LinearSVC has no kernel, degree or
# gamma parameters, so only the regularisation strength C is searched. The
# grid is defined and passed explicitly in this cell so the search does not
# depend on a `param_grid` left over from the logistic-regression cell.
svc_param_grid = {
    'C':[0.01, 0.1, 1, 10],
}
grid = GridSearchCV(SVCmodel, svc_param_grid)
grid.fit(x_train, y_train)

In [None]:
# Show the winning hyper-parameters from the search.
print(f"Best parameter: {grid.best_params_}")

In [None]:
# Predict the test split with the fitted search object.
y_pred = grid.predict(x_test)

In [None]:
# Accuracy of the most recent grid-search predictions (ground truth first).
logreg_acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {logreg_acc*100:.2f}%")

In [None]:
# Confusion matrix, two blank lines, then the per-class report.
print(confusion_matrix(y_test, y_pred), end="\n\n\n")
print(classification_report(y_test, y_pred))

### Experimental performance

#### tokenization filtration and script validation

In [None]:
import re
from nltk.tokenize import word_tokenize
def preprocess_tweet(sen):
    '''Clean one piece of tweet text and return it as a list of tokens.

    Steps, in order: lowercase; remove URLs; remove a leading retweet marker;
    remove @mentions; strip punctuation (which also drops '#', keeping the
    hashtag word); replace any remaining non-ASCII-alphanumeric character with
    a space; drop isolated single letters; collapse whitespace; tokenize with
    `word_tokenize`. Stop words are NOT removed here.

    Parameters
    ----------
    sen : str or NaN/None
        Raw text. Missing values yield an empty string.

    Returns
    -------
    list of str
        Tokens of the cleaned text (an empty string for missing input).
    '''
    # Missing values become an empty document
    if pd.isna(sen):
        return ""

    # Convert to lowercase (str() guards against non-string input)
    sentence = str(sen).lower()

    # URLs must go while "://" and "." are still present; once punctuation is
    # stripped they would degrade into junk words such as "httptcoabc".
    sentence = re.sub(r"\w+://\S+|www\.\S+", '', sentence)

    # Remove a leading retweet marker ("rt @user: ...")
    sentence = re.sub(r'^rt\s+', '', sentence)

    # Remove @mentions while the '@' is still there; \w also covers underscores
    sentence = re.sub(r'@\w+', '', sentence)

    # Strip punctuation; this removes '#' but keeps the hashtag word itself
    sentence = re.sub(r'[^\w\s]', '', sentence)

    # Replace anything that is not an ASCII letter, digit, space or tab
    sentence = re.sub(r"[^0-9A-Za-z \t]", " ", sentence)

    # Remove single characters surrounded by whitespace
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Collapse runs of whitespace and trim the ends
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    return word_tokenize(sentence)
   
    
# Tokenize `text`. NOTE(review): `text` is not defined in this cell; it relies
# on whatever value an earlier cell last assigned to that name.
a=preprocess_tweet(text)
print(a)

#### stopwords removal

In [None]:
from nltk.corpus import stopwords
# English stop-word list used as the filter below.
stop_words = set(stopwords.words('english'))
sentence_tokens = word_tokenize(text)
# Keep only the tokens that are not stop words, then rebuild the text.
filtered_text = [token for token in sentence_tokens if token not in stop_words]
cleaned_text = " ".join(filtered_text)
print(cleaned_text)

#### stemming

In [None]:

from nltk.stem import PorterStemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def stemming(data):
    '''Apply Porter stemming and return the stemmed result.

    Parameters
    ----------
    data : str or iterable of str
        A whitespace-separated string of words, or an iterable of words.

    Returns
    -------
    str or list of str
        For a string: the stemmed words joined by single spaces.
        For any other iterable: a list with one stem per input word.
    '''
    if isinstance(data, str):
        # Split first: iterating a string directly would stem single characters.
        return " ".join(stemmer.stem(word) for word in data.split())
    return [stemmer.stem(word) for word in data]

s= stemming(cleaned_text)
print(s)

#### lemmatization

In [None]:
import nltk

# Download NLTK resources (if not already downloaded):
# 'punkt', 'averaged_perceptron_tagger' and 'wordnet'.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

In [None]:
# Lemmatize the cleaned text word by word. WordNetLemmatizer.lemmatize expects
# a single word, so passing the whole string would return it unchanged.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
s = " ".join(lemmatizer.lemmatize(word) for word in cleaned_text.split())
print(s)


#### morphological analysis

In [None]:
# Download NLTK resources (if not already downloaded):
# 'punkt' and 'averaged_perceptron_tagger'.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Input sentence for analysis
sentence =cleaned_text

# Tokens produced by the earlier tokenization cell
words = sentence_tokens

# Perform part-of-speech tagging (Penn Treebank tags)
pos_tags = nltk.pos_tag(words)

# First letter of a Penn Treebank tag -> WordNet part-of-speech code, so the
# lemmatizer reduces verbs/adjectives/adverbs correctly instead of treating
# every token as a noun.
WORDNET_POS = {"J": "a", "V": "v", "N": "n", "R": "r"}

# Morphological features implied by a Penn Treebank tag; tags that carry no
# inflectional information map to "_".
PENN_FEATURES = {
    "NN": "Number=Sing",
    "NNP": "Number=Sing",
    "NNS": "Number=Plur",
    "NNPS": "Number=Plur",
    "VB": "VerbForm=Inf",
    "VBD": "Tense=Past|VerbForm=Fin",
    "VBG": "VerbForm=Ger",
    "VBN": "Tense=Past|VerbForm=Part",
    "VBP": "Tense=Pres|VerbForm=Fin",
    "VBZ": "Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
    "JJR": "Degree=Cmp",
    "JJS": "Degree=Sup",
    "RBR": "Degree=Cmp",
    "RBS": "Degree=Sup",
}

# Collect one record per token
analysis_data = []
for word, pos_tag in pos_tags:
    lemma = lemmatizer.lemmatize(word, pos=WORDNET_POS.get(pos_tag[:1], "n"))
    # Table lookup is safe for every tag, including one-character tags such
    # as "." or ",", which cannot be indexed at position 1.
    morphological_features = PENN_FEATURES.get(pos_tag, "_")
    analysis_data.append(
        {"Token": word, "Lemma": lemma, "POS": pos_tag, "Morphological Features": morphological_features}
    )

# Print the formatted output
print("Token   | Lemma    | POS ")
print("=" * 50)
for item in analysis_data:
    print(f"{item['Token']:<35} | {item['Lemma']:<35} | {item['POS']:<10} ")


#### n-gram model


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def train_sentiment_model(data):
    """Train a 1-3 gram bag-of-words Naive Bayes sentiment classifier.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'cleaned' column (text) and a 'sentiment' column (labels).

    Returns
    -------
    (CountVectorizer, MultinomialNB)
        The fitted vectorizer and the classifier trained on its output.
    """
    # Read from the argument, not from a notebook global, so the function
    # trains on whatever frame the caller passes in.
    X_train = data['cleaned']
    y_train = data['sentiment']

    vectorizer = CountVectorizer(ngram_range=(1, 3))
    X_train_vec = vectorizer.fit_transform(X_train)

    classifier = MultinomialNB()
    classifier.fit(X_train_vec, y_train)

    return vectorizer, classifier

def predict_sentiment(text, vectorizer, classifier):
    """Return the predicted sentiment label for a single text string."""
    X_test_vec = vectorizer.transform([text])
    sentiment = classifier.predict(X_test_vec)
    return sentiment[0]

# Train on the frame that actually holds the 'cleaned' and 'sentiment' columns
vectorizer, classifier = train_sentiment_model(text_df)

# Example usage: Predict sentiment for a new text
new_text = input('enter the new input:  ')
predicted_sentiment = predict_sentiment(new_text, vectorizer, classifier)
print(f"Predicted sentiment: {predicted_sentiment}")


In [None]:
def create_ngrams(sentence, n):
    """Return every run of `n` consecutive whitespace-separated words as a tuple.

    The result is a list of tuples in left-to-right order; it is empty when
    the sentence has fewer than `n` words.
    """
    tokens = sentence.split()
    window_count = len(tokens) - (n - 1)
    grams = []
    for start in range(window_count):
        grams.append(tuple(tokens[start:start + n]))
    return grams

def build_ngram_model(corpus, n):
    """Build a mapping from each (n-1)-word prefix to the words that followed it.

    The corpus is split into sentences on '. '; every n-gram of every sentence
    contributes its last word to the list stored under its leading n-1 words.
    Repeated continuations are kept, so list counts reflect frequency.
    """
    model = {}
    for sentence in corpus.split('. '):
        for gram in create_ngrams(sentence, n):
            context = gram[:-1]
            following_word = gram[-1]
            if context not in model:
                model[context] = []
            model[context].append(following_word)
    return model

def predict_next_word(prefix, ngram_model):
    """Return the most frequent continuation recorded for `prefix`.

    Parameters
    ----------
    prefix : tuple of str
        The context words to look up.
    ngram_model : dict
        Mapping from prefix tuples to lists of observed next words.

    Returns
    -------
    str or None
        The most common next word; on a tie, the one observed first.
        None when the prefix is unknown or has no continuations.
    """
    from collections import Counter

    possible_next_words = ngram_model.get(prefix, [])
    if not possible_next_words:
        return None
    # Counter counts in a single pass and breaks ties by first occurrence,
    # so the result is deterministic across runs.
    return Counter(possible_next_words).most_common(1)[0][0]

# Interactive driver: build the n-gram model from cleaned_text, then predict
# the word following a user-supplied prefix.
n = int(input("Enter the value of N: "))
corpus = cleaned_text

ngram_model = build_ngram_model(corpus, n)

input_prefix = tuple(input(f"Enter a prefix of length {n-1}: ").split())
predicted_word = predict_next_word(input_prefix, ngram_model)

if not predicted_word:
    # Fallback: look for a stored prefix whose leading words equal the input
    # prefix with its first word dropped.
    fallback_candidates = (prefix for prefix in ngram_model if prefix[:-1] == input_prefix[1:])
    possible_prefix = max(fallback_candidates, key=len, default=None)
    if not possible_prefix:
        print("No prediction available for the given prefix.")
    else:
        possible_next_words = ngram_model[possible_prefix]
        most_likely_word = max(set(possible_next_words), key=possible_next_words.count)
        print(f"No direct prediction. Predicting the most likely word based on frequency: {most_likely_word}")
else:
    print(f"Predicted next word: {predicted_word}")


In [None]:
# Noun-phrase chunking of the cleaned text (this cell does not predict sentiment).
import nltk
from nltk import RegexpParser
from nltk.tokenize import word_tokenize

text = cleaned_text

# Tokenize the text into words
words = word_tokenize(text)

# Perform part-of-speech tagging
pos_tags = nltk.pos_tag(words)
# Define a grammar for noun phrases (NP): an optional determiner, any number
# of adjectives, then one singular common noun (tag NN only).
grammar = r"""
    NP: {<DT>?<JJ>*<NN>}  # Chunk sequences of determiners, adjectives, and nouns
"""
# Create a chunk parser with the defined grammar
chunk_parser = RegexpParser(grammar)
# Parse the POS-tagged words to find noun phrases
tree = chunk_parser.parse(pos_tags)
# Print the resulting chunk tree as text
print(tree)




#### POS Tagging


In [None]:
import nltk
from nltk.tokenize import word_tokenize
text = cleaned_text

# Split the cleaned text into word tokens
sentence_tokens = word_tokenize(text)
words = sentence_tokens

# Tag every token with its part of speech
pos_tags = nltk.pos_tag(words)

# Two-column listing: token (left-aligned, width 30) and its tag
print("Word  |  POS Tag")
print("-" * 15)
for word, pos_tag in pos_tags:
    print("{:<30} | {}".format(word, pos_tag))


#### chunking of the text

In [None]:
import nltk
from nltk import RegexpParser
from nltk.tokenize import word_tokenize
# Download the 'punkt' NLTK resource
nltk.download('punkt')

text = cleaned_text

# Tokenize the text into words
sentence_tokens = word_tokenize(text)
words= sentence_tokens

# Perform part-of-speech tagging
pos_tags = nltk.pos_tag(words)

# Define a grammar for noun phrases (NP): an optional determiner, any number
# of adjectives, then one singular common noun (tag NN only).
grammar = r"""
    NP: {<DT>?<JJ>*<NN>}  # Chunk sequences of determiners, adjectives, and nouns
"""
# Create a chunk parser with the defined grammar
chunk_parser = RegexpParser(grammar)

# Parse the POS-tagged words to find noun phrases
tree = chunk_parser.parse(pos_tags)

print(tree)# can use tree.pretty_print() method


#### NER

In [None]:
import spacy
# Load the spaCy pipeline from a directory path relative to the working directory.
# NOTE(review): the path embeds the model version (3.6.0) — confirm it exists
# in the environment where the notebook is run.
nlp = spacy.load("en_core_web_sm/en_core_web_sm-3.6.0/")
doc = nlp(cleaned_text)
# Print each detected entity's text together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

#### text similarity recognizer


In [None]:
import spacy

# Load the spaCy pipeline from a local directory.
nlp = spacy.load("en_core_web_sm/en_core_web_sm-3.6.0/")

# Three hand-written example sentences to compare.
document1 = "I find joy in exploring the wonders of natural language processing."
document2 = "NLP is a fascinating discipline that intersects with artificial intelligence."
document3 = "Machine learning, an integral part of AI, empowers algorithms to decipher patterns."

doc1 = nlp(document1)
doc2 = nlp(document2)
doc3 = nlp(document3)

# Pairwise Doc.similarity scores.
# NOTE(review): presumably the "sm" pipeline ships without static word vectors,
# in which case these scores are only a rough approximation — confirm against
# the spaCy documentation, or use a model that includes vectors.
similarity_doc1_doc2 = doc1.similarity(doc2)
similarity_doc1_doc3 = doc1.similarity(doc3)
similarity_doc2_doc3 = doc2.similarity(doc3)

# Print the similarity scores
print("Similarity between document 1 and document 2:", similarity_doc1_doc2)
print("Similarity between document 1 and document 3:", similarity_doc1_doc3)
print("Similarity between document 2 and document 3:", similarity_doc2_doc3)


In [None]:
# Print the cleaned text of every tweet labelled positive, one per line.
positive_mask = text_df['sentiment'] == 'positive'
for text in text_df.loc[positive_mask, 'cleaned']:
    print(text)


In [None]:
import spacy


# Load the spaCy language model from a local directory
nlp = spacy.load("en_core_web_sm/en_core_web_sm-3.6.0/")

# text_df is expected to already hold the 'sentiment' and 'cleaned' columns
# produced by the earlier cells.

# Get cleaned texts for each sentiment
positive_texts = text_df.loc[text_df['sentiment'] == 'positive', 'cleaned'].tolist()
negative_texts = text_df.loc[text_df['sentiment'] == 'negative', 'cleaned'].tolist()
neutral_texts = text_df.loc[text_df['sentiment'] == 'neutral', 'cleaned'].tolist()

# Concatenate the texts into single strings:
# document1 = positive, document2 = negative, document3 = neutral
document1 = ' '.join(positive_texts)
document2 = ' '.join(negative_texts)
document3 = ' '.join(neutral_texts)

# Process the documents using spaCy
# NOTE(review): each document is the concatenation of every tweet in a class;
# presumably a large dataset can exceed spaCy's per-text length limit
# (nlp.max_length) — confirm against the data size.
doc1 = nlp(document1)
doc2 = nlp(document2)
doc3 = nlp(document3)

# Calculate similarity scores between documents with Doc.similarity
# NOTE(review): presumably the "sm" pipeline has no static word vectors, so
# these scores may not be meaningful — confirm against the spaCy documentation.
similarity_doc1_doc2 = doc1.similarity(doc2)
similarity_doc1_doc3 = doc1.similarity(doc3)
similarity_doc2_doc3 = doc2.similarity(doc3)

# Print the similarity scores
print("Similarity between document 1 and document 2:", similarity_doc1_doc2)
print("Similarity between document 1 and document 3:", similarity_doc1_doc3)
print("Similarity between document 2 and document 3:", similarity_doc2_doc3)
