In [1]:
import joblib
from io import StringIO
import pickle
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langdetect import detect, LangDetectException
from nltk.sentiment import SentimentIntensityAnalyzer


In [2]:
# Path (relative to the notebook's working directory) of the serialized
# classifier — presumably a random-forest model, judging by the filename.
model_filename = 'random_forest_modelTweets.joblib'

# Load the trained model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
loaded_model = joblib.load(model_filename)


In [3]:
# Sample raw tweet used to exercise the pipeline below. It is deliberately
# noisy (slashes, punctuation, an @-mention); clean_tweet reduces it to
# "free palestine user".
test="free //. palestine // %@user ;{"

In [4]:
class TweetProcessor:
    """
    Cleans, language-filters and lemmatizes a batch of tweets.

    The input is a single string holding one tweet per line. The pipeline is:
    clean_tweet -> lang (keep English only) -> tokenize_and_lemmatize.
    """

    # Matches http:// and https:// URLs as well as bare "www." links.
    # (The previous pattern, r'https\S+', left plain http:// links in place,
    # which then leaked into the text as "http", "example", "com", ...)
    _URL_PATTERN = re.compile(r'https?://\S+|\bwww\.\S+', flags=re.IGNORECASE)

    # NLTK resources are loaded lazily, once, and shared by every instance
    # instead of being rebuilt for each tweet.
    _stop_words = None
    _lemmatizer = None

    # Constructor
    def __init__(self, tweet_data):
        """
        Initializes the object with tweet data provided as a string.

        Parameters:
            tweet_data (str): Tweets separated by newline characters.
        """
        self.tweets = tweet_data.split('\n')

    @classmethod
    def _get_stop_words(cls):
        """Return the cached set of English stop words, loading it on first use."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english'))
        return cls._stop_words

    @classmethod
    def _get_lemmatizer(cls):
        """Return the cached WordNet lemmatizer, creating it on first use."""
        if cls._lemmatizer is None:
            cls._lemmatizer = WordNetLemmatizer()
        return cls._lemmatizer

    # Method to clean tweets
    def clean_tweet(self, tweet):
        """
        Cleans up a tweet by removing links, special characters, and extra spaces.

        Parameters:
            tweet (str): Raw tweet text.

        Returns:
            str: The tweet with links removed, every non-word character
            replaced by a space, whitespace runs collapsed, and no leading
            or trailing spaces.
        """
        # Delete links (must happen before punctuation is stripped,
        # otherwise the URL is broken into ordinary-looking words)
        tweet = self._URL_PATTERN.sub('', tweet)
        # Delete special characters
        tweet = re.sub(r'\W', ' ', tweet)
        # Replace multiple spaces with a single space
        tweet = re.sub(r'\s+', ' ', tweet)
        # Remove leading and trailing spaces
        return tweet.strip()

    # Method to tokenize the tweets
    def tokenize_and_lemmatize(self, tweet):
        """
        Tokenize and lemmatize a tweet.

        Tokens are kept only if purely alphabetic, lowercased, filtered
        against the English stop-word list, and lemmatized.

        Parameters:
            tweet (str): Cleaned tweet text.

        Returns:
            str: The surviving lemmas joined by single spaces (may be empty).
        """
        stop_words = self._get_stop_words()
        lemmatizer = self._get_lemmatizer()
        # Drop punctuation/numbers and lowercase before the stop-word check,
        # because the stop-word list is lowercase.
        lowered = (word.lower() for word in word_tokenize(tweet) if word.isalpha())
        lemmas = [lemmatizer.lemmatize(word) for word in lowered if word not in stop_words]
        # Join tokens into a text string
        return ' '.join(lemmas)

    def process_tweets(self):
        """
        Runs the full pipeline over every stored tweet.

        Each tweet is cleaned; tweets not detected as English (including
        tweets for which detection fails) are dropped entirely; the rest
        are tokenized and lemmatized.

        Returns:
            list[str]: One processed string per retained tweet. The list can
            be shorter than self.tweets, and is empty when no tweet passes
            the language filter.
        """
        cleaned_tweets = [self.clean_tweet(tweet) for tweet in self.tweets]
        lang_filtered_tweets = [tweet for tweet in cleaned_tweets if self.lang(tweet)]
        tokenized_and_lemmatized_tweets = [self.tokenize_and_lemmatize(tweet) for tweet in lang_filtered_tweets]

        return tokenized_and_lemmatized_tweets

    def lang(self, x):
        """
        Function that detects if a tweet is in English or not.

        Parameters:
            x (str): Text to classify.

        Returns:
            bool: True when langdetect reports 'en'; False for any other
            language or when detection raises LangDetectException.

        Note:
            langdetect is non-deterministic unless its DetectorFactory is
            seeded, so borderline (very short) tweets may be classified
            differently between runs.
        """
        try:
            return detect(x) == 'en'
        except LangDetectException:
            return False

In [5]:

class SentimentAnalysisReport:
    """
    Computes, categorizes and prints VADER sentiment scores for a text.

    Typical use: calculate_sentiment_scores() -> categorize_sentiment() ->
    display_sentiment(). The later steps run the earlier ones on demand if
    they were skipped.
    """

    # Scores reported when the input cannot be analyzed as text.
    _NEUTRAL_SCORES = {
        "Compound": 0.0,
        "Positive": 0.0,
        "Negative": 0.0,
        "Neutral": 1.0
    }

    def __init__(self, text):
        """
        Parameters:
            text: The text to analyze. A str is analyzed as-is; a non-empty
                list/tuple of str is joined with spaces; anything else
                (e.g. None, NaN, an empty list) yields the neutral fallback.
        """
        self.text = text
        self.analyzer = SentimentIntensityAnalyzer()

    def _as_text(self):
        """Return the input as a single string, or None if it is not analyzable."""
        text = self.text
        if isinstance(text, str):
            return text
        # A list of already-processed tweets is a natural input here; without
        # this branch it silently fell through to the neutral fallback.
        if isinstance(text, (list, tuple)) and text and all(isinstance(part, str) for part in text):
            return ' '.join(text)
        return None

    def calculate_sentiment_scores(self):
        """
        Calculate sentiment scores.

        Stores a dict with the keys "Compound", "Positive", "Negative" and
        "Neutral" in self.sentiment_scores. Non-analyzable input produces
        the neutral fallback (compound 0.0, neutral 1.0).
        """
        text = self._as_text()
        if text is not None:
            # Score the text once; every component comes from the same call.
            scores = self.analyzer.polarity_scores(text)
            self.sentiment_scores = {
                "Compound": scores["compound"],
                "Positive": scores["pos"],
                "Negative": scores["neg"],
                "Neutral": scores["neu"]
            }
        else:
            # Handle non-string values (copy so callers cannot mutate the default)
            self.sentiment_scores = dict(self._NEUTRAL_SCORES)

    def categorize_sentiment(self, threshold=0.5):
        """
        Categorize sentiment based on the Compound score.

        Parameters:
            threshold (float): Symmetric cut-off. compound <= -threshold is
                'Negative', compound >= threshold is 'Positive', anything
                strictly in between is 'Neutral'. Defaults to 0.5.
                NOTE(review): VADER's own documentation suggests +/-0.05;
                0.5 is much stricter — confirm this is intended.

        Stores the label in self.sentiment_category.
        """
        # Compute the scores on demand instead of failing with AttributeError.
        if not hasattr(self, "sentiment_scores"):
            self.calculate_sentiment_scores()

        compound_score = self.sentiment_scores.get("Compound", 0.0)

        if compound_score <= -threshold:
            self.sentiment_category = 'Negative'
        elif compound_score < threshold:
            self.sentiment_category = 'Neutral'
        else:
            self.sentiment_category = 'Positive'

    def display_sentiment(self):
        """
        Display the sentiment scores and category.

        Prints one "key: value" line per score followed by the category.
        """
        # Make sure both the scores and the category exist before printing.
        if not hasattr(self, "sentiment_category"):
            self.categorize_sentiment()

        print("Sentiment Scores:")
        for key, value in self.sentiment_scores.items():
            print(f"{key}: {value}")

        print("Sentiment Category:", self.sentiment_category)


In [6]:
# Run the full preprocessing pipeline on the sample text.
tweet_processor = TweetProcessor(test)
# `cleand` is a list with one processed string per tweet that passed the
# English-language filter; it is empty when no tweet passes (as in the
# recorded output below).
cleand = tweet_processor.process_tweets()
cleand

[]

In [7]:
# Load the pretrained Doc2Vec model used to turn processed tweets into vectors.
# The path is relative to the notebook's working directory. Despite the .pkl
# extension the file is read with gensim's own Doc2Vec.load; as with any
# pickle-based format, only load files from a trusted source.
model = Doc2Vec.load('trained_doc2vec_model.pkl')

In [8]:
# Doc2Vec.infer_vector expects ONE document given as a list of word tokens.
# `cleand` is a list of whole processed tweets (one string per tweet), so
# passing it directly would treat each full tweet as a single unknown "word".
# Join the tweets and split on whitespace to obtain the actual tokens.
doc_tokens = " ".join(cleand).split()
if not doc_tokens:
    # Nothing survived preprocessing (e.g. the language filter dropped every
    # tweet); the vector below carries no information about the input.
    print("Warning: no tokens left after preprocessing; the inferred vector is not meaningful.")
inferred_vector = model.infer_vector(doc_tokens)
inferred_vector

array([-4.1048029e-03,  2.1048170e-03,  3.8674616e-03, -2.1194341e-03,
       -3.0556857e-03,  3.8917405e-03, -4.8500113e-03,  9.1276347e-04,
        4.9865870e-03,  2.9245378e-03,  3.2507444e-03,  1.9515931e-04,
        3.1592620e-03, -4.8182495e-03,  3.2734632e-04,  1.2014884e-03,
        4.8894952e-03, -1.5125013e-03, -2.0774028e-03, -1.9986033e-03,
       -4.9475795e-03, -1.2206477e-03, -4.9879537e-03, -1.2341264e-03,
       -1.5432852e-03,  2.9863208e-03, -4.6120239e-03, -2.6199460e-04,
       -2.6065835e-03,  1.0669709e-03, -1.3504979e-03, -2.6406394e-03,
        2.0917701e-03,  4.6508447e-03,  1.1680013e-03,  1.8019611e-03,
       -4.5624925e-03, -1.8839276e-03, -1.9951828e-03,  4.5213252e-03,
        3.7145007e-03, -1.6019195e-03, -3.1930518e-03, -2.7568424e-03,
       -2.7907521e-03, -4.2059505e-03,  3.2386810e-03,  6.4693985e-04,
        6.7215861e-04, -1.1957869e-03, -2.0101955e-03, -2.7457490e-03,
        3.7305712e-04, -4.7308714e-03,  1.3968295e-03,  4.5433831e-03,
      

In [11]:

# Wrap the single document vector in a list so predict receives one sample (row).
predicted_support = loaded_model.predict([inferred_vector])
print("Predicted Support:", predicted_support)

# The sentiment analyzer scores a single string. `cleand` is a list of
# processed tweets, and handing the list over directly made the report fall
# into its non-string fallback (always 0.0 / Neutral). Join the tweets first;
# an empty result is passed as None so the neutral fallback is used explicitly.
sentiment_text = " ".join(cleand) or None
sentiment_report = SentimentAnalysisReport(sentiment_text)
sentiment_report.calculate_sentiment_scores()
sentiment_report.categorize_sentiment()
sentiment_report.display_sentiment()


Predicted Support: [1.]
Sentiment Scores:
Compound: 0.0
Positive: 0.0
Negative: 0.0
Neutral: 1.0
Sentiment Category: Neutral
