In [1]:
import os

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


In [2]:

# Directory holding the fine-tuned sequence-classification checkpoint.
# Defaults to the original local path; set the MODEL_PATH environment variable
# to load the checkpoint from somewhere else without editing the notebook.
model_path = os.environ.get('MODEL_PATH', 'G:/Malk/Qafza/Final_Project/models/Model')
# The model weights and the tokenizer are both read from the same directory.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


### Data preprocessing

In [12]:
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from unidecode import unidecode 
from collections import Counter
from langdetect import detect
import seaborn as sns
import pandas as pd
import nltk
import re

# Download necessary NLTK resources (tokenizer, stopword list, POS tagger, WordNet)
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

# Load dataset. The default is the original local file; set the NEWS_CSV_PATH
# environment variable to read the CSV from another location.
df = pd.read_csv(os.environ.get("NEWS_CSV_PATH", "G:\\Malk\\Qafza\\Final_Project\\data\\news.csv"))

# Initialize tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Extra stopwords layered on top of the NLTK English list. Entries are listed
# once each, alphabetically. NOTE(review): "toke" and "seening" look like typos;
# they are kept so the resulting set is unchanged.
custom_stopwords = {
    # a - c
    "also", "american", "appear", "artist",
    "back", "became", "become", "becomes", "began", "begin", "beginning", "begins",
    "big", "bring", "bringing", "brings", "brought", "business",
    "call", "called", "calling", "calls", "came", "can", "case", "change", "charge",
    "child", "china", "city", "come", "comes", "company", "could", "countries",
    "country", "couple",
    # d - h
    "day",
    "election", "end", "ended", "ending", "even",
    "face", "faced", "faces", "facing", "family", "find", "first", "former", "found",
    "game", "get", "give", "given", "gives", "giving", "go", "goes", "going", "gone",
    "government", "group",
    "held", "help", "history", "hold", "holding", "holds", "home", "hope",
    # i - o
    "including",
    "knew", "know", "knowing", "knows",
    "large", "last", "leader", "life", "live", "long", "look",
    "made", "make", "man", "many", "may", "met", "million", "month", "move", "moved",
    "moves", "moving", "much",
    "need", "needed", "needing", "needs", "new",
    "offer", "offered", "offering", "offers", "official", "old", "one", "open", "others",
    # p - s
    "part", "party", "people", "place", "plan", "play", "played", "player", "playing",
    "plays", "police", "power", "president", "protest", "put", "puts", "putting",
    "ran", "report", "return", "right", "run", "running", "runs", "russia",
    "said", "saw", "say", "saying", "says", "season", "see", "seeing", "seen",
    "seening", "sees", "set", "show", "showed", "showing", "shows", "star", "start",
    "started", "starts", "state", "states", "still", "support",
    # t - y
    "take", "taken", "takes", "taking", "talk", "team", "teams", "tell", "telling",
    "tells", "three", "time", "toke", "token", "told", "took", "top", "tried", "tries",
    "trump", "try", "trying", "two",
    "united", "use", "used", "uses", "using",
    "want", "way", "week", "weeks", "will", "woman", "women", "work", "world", "would",
    "year", "years", "york",
}
# Combined filter list: NLTK English stopwords plus the custom additions.
all_stopwords = stop_words | custom_stopwords

# Function to normalize text (lowercase, remove special characters)
def normalize_text(text):
    """Reduce ``text`` to lowercase ASCII letters separated by single spaces.

    Args:
        text: the string to normalize.

    Returns:
        str: the transliterated, lowercased text in which every character
        other than ``a``-``z`` or whitespace has been replaced by a space,
        whitespace runs are collapsed to one space, and leading/trailing
        whitespace is removed.
    """
    # Transliterate before lowercasing: the transliteration step can itself
    # produce capital ASCII letters, which the a-z filter below would
    # otherwise turn into spaces.
    text = unidecode(text).lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Trim last, because the substitution above can leave blanks at the edges
    # (e.g. where trailing punctuation used to be).
    return re.sub(r'\s+', ' ', text).strip()

# Function to map POS tags for better lemmatization
def get_wordnet_pos_bulk(words):
    """Tag ``words`` and pair each one with a WordNet part-of-speech constant.

    The first letter of each tag returned by ``nltk.pos_tag`` selects the
    WordNet category: ``J`` -> adjective, ``V`` -> verb, ``R`` -> adverb.
    Any other tag falls back to noun.

    Args:
        words: list of tokens to tag.

    Returns:
        list of ``(word, wordnet_pos)`` tuples in the same order as ``words``.
    """
    pos_by_initial = {"J": wordnet.ADJ, "V": wordnet.VERB, "R": wordnet.ADV}
    pairs = []
    for token, tag in nltk.pos_tag(words):
        pairs.append((token, pos_by_initial.get(tag[0].upper(), wordnet.NOUN)))
    return pairs

# Main preprocessing function
def preprocess_text(text):
    """Clean one document down to a space-joined string of lemmas.

    Steps: normalize the text, tokenize it, drop stopwords and tokens of two
    characters or fewer, POS-tag what remains and lemmatize each token with
    its tag.

    Args:
        text: raw document; anything that is not a ``str`` is rejected.

    Returns:
        str | None: the lemmas joined by single spaces, or ``None`` when the
        input is not a string, is shorter than 10 characters, or has no
        tokens left after filtering.
    """
    if not (isinstance(text, str) and len(text) >= 10):
        return None

    normalized = normalize_text(text)

    # Keep only informative tokens: longer than two characters and not a stopword.
    kept = []
    for token in word_tokenize(normalized):
        if len(token) > 2 and token not in all_stopwords:
            kept.append(token)

    # Lemmatize each surviving token using its WordNet part of speech.
    lemmas = [lemmatizer.lemmatize(token, pos) for token, pos in get_wordnet_pos_bulk(kept)]

    if not lemmas:
        return None
    return ' '.join(lemmas)

[nltk_data] Downloading package punkt to C:\Users\Aya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Aya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Model

In [13]:
# Classify one headline after cleaning it with preprocess_text.
new_text = "The stock market is expected to rise tomorrow."
clean_text = preprocess_text(new_text)
if clean_text is None:
    # preprocess_text returns None when the input is too short or nothing
    # survives filtering; fail with a clear message instead of handing None
    # to the tokenizer.
    raise ValueError("Nothing left to classify after preprocessing.")
inputs = tokenizer(clean_text, return_tensors='pt')
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1)
# NOTE(review): assumes class id 1 is the positive label - confirm against the model config.
print("Predicted class:", "Positive" if predictions.item() == 1 else "Negative")


Predicted class: Positive


In [14]:
# Classify a raw (un-preprocessed) headline.
new_text = "The government announced new economic policies to stimulate growth."
inputs = tokenizer(new_text, return_tensors='pt')
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1)
# NOTE(review): assumes class id 1 is the positive label - confirm against the model config.
print("Predicted class:", "Positive" if predictions.item() == 1 else "Negative")


Predicted class: Positive


In [15]:
# Classify a raw (un-preprocessed) headline.
new_text = "The stock market is facing a significant downturn due to the economic crisis."
inputs = tokenizer(new_text, return_tensors='pt')
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1)
# NOTE(review): assumes class id 1 is the positive label - confirm against the model config.
print("Predicted class:", "Positive" if predictions.item() == 1 else "Negative")


Predicted class: Negative


In [16]:
# Classify a raw (un-preprocessed) headline.
new_text = "The company reported disappointing earnings, leading to a sharp decline in its stock price."
inputs = tokenizer(new_text, return_tensors='pt')
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1)
# NOTE(review): assumes class id 1 is the positive label - confirm against the model config.
print("Predicted class:", "Positive" if predictions.item() == 1 else "Negative")

Predicted class: Negative


In [17]:
# Classify a raw (un-preprocessed) headline.
new_text = "The recent market volatility has caused investors to panic and sell their stocks."
inputs = tokenizer(new_text, return_tensors='pt')
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1)
# NOTE(review): assumes class id 1 is the positive label - confirm against the model config.
print("Predicted class:", "Positive" if predictions.item() == 1 else "Negative")

Predicted class: Negative


#### Ranking

In [88]:
import praw

# Reddit API credentials are read from the environment so that secrets never
# live in the notebook. Export REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# running this cell; a missing variable raises KeyError.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be revoked and regenerated.
client_id = os.environ['REDDIT_CLIENT_ID']
client_secret = os.environ['REDDIT_CLIENT_SECRET']
user_agent = 'praw:com.example.myapp:v1.0 (by /u/Sad-Information9604)'

reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent, redirect_uri='http://localhost:8000')

# Search for posts related to financial markets
subreddit = reddit.subreddit('stocks')  # can be pointed at other subreddits such as 'stocks', 'finance', etc.
query = 'The stock market is facing a significant downturn due to the economic crisis.'

# Number of posts to fetch
num_posts = 10
ne=subreddit.search(query, limit=num_posts,time_filter='day')
# Walk the search results, printing each post's title and engagement figures
number_of_post=0
Total=0  # running sum of score + comment count; accumulated but not printed in this cell
for submission in ne:
    number_of_post+=1
    Total +=submission.score+submission.num_comments
    print(f"Title: {submission.title}")
    print(f"Upvotes: {submission.score}")
    print(f"Number of Comments: {submission.num_comments}")
    print("-" * 50,number_of_post)



Title: Trump: New travel barriers for Canadian tourists, the biggest source of US tourism. Expect impact on airlines, hotels, retail, restaurants
Upvotes: 2746
Number of Comments: 487
-------------------------------------------------- 1
Title: EU Targets €26 Billion of US Products in Tariff Retaliation
Upvotes: 198
Number of Comments: 25
-------------------------------------------------- 2
Title: The problem with TSLA stock
Upvotes: 0
Number of Comments: 28
-------------------------------------------------- 3
Title: These are the stocks on my watchlist (03/12)
Upvotes: 10
Number of Comments: 7
-------------------------------------------------- 4
Title: CPiI increased by 0.2% MoM and the annual rate of increase was 2.8% in Feb
Upvotes: 15
Number of Comments: 5
-------------------------------------------------- 5


In [103]:
import praw

def get_reddit_posts(client_id, client_secret, user_agent, subreddit_name, query, num_posts=20, time_filter='day'):
    """Return the mean min-max-normalized engagement of Reddit search hits.

    Searches ``subreddit_name`` for ``query`` and, for each submission found,
    adds its score to its comment count. Those totals are min-max scaled to
    the 0-1 range across the result set, and the mean of the scaled values is
    returned.

    Args:
        client_id: Reddit API client id passed to ``praw.Reddit``.
        client_secret: Reddit API client secret passed to ``praw.Reddit``.
        user_agent: user-agent string passed to ``praw.Reddit``.
        subreddit_name: name of the subreddit to search.
        query: search query string.
        num_posts: passed as ``limit`` to the search.
        time_filter: passed as ``time_filter`` to the search.

    Returns:
        float: the mean of the normalized values. ``0.0`` when the search
        returns no posts (previously a ``ZeroDivisionError``) or when every
        post has the same engagement total.
    """
    reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent, redirect_uri='http://localhost:8000')

    submissions = reddit.subreddit(subreddit_name).search(query, limit=num_posts, time_filter=time_filter)

    # One engagement figure per submission: upvote score plus comment count.
    engagement = [submission.score + submission.num_comments for submission in submissions]

    # No hits: there is nothing to average, so report zero engagement.
    if not engagement:
        return 0.0

    lowest = min(engagement)
    spread = max(engagement) - lowest

    # All posts identical: min-max scaling is undefined, treat every value as 0.
    if spread == 0:
        return 0.0

    return sum((value - lowest) / spread for value in engagement) / len(engagement)


# Example run: score one headline against r/stocks (20 posts, time_filter='day'),
# reusing the credentials defined as globals in an earlier cell.
query = 'The stock market is facing a significant downturn due to the economic crisis.'

# NOTE: despite the plural name, this holds the single float returned by get_reddit_posts.
normalized_values = get_reddit_posts(client_id, client_secret, user_agent, subreddit_name='stocks', query=query, num_posts=20, time_filter='day')

print(normalized_values)


0.2125523012552301


In [104]:
# Sanity check: confirm get_reddit_posts produced a single number, not a list.
type(normalized_values)

float

In [105]:
import praw
import os

def get_reddit_posts(query):
    """Return the mean min-max-normalized engagement of r/stocks search hits.

    Searches the ``stocks`` subreddit for ``query`` (up to 50 posts,
    ``time_filter='day'``). For each submission, its score is added to its
    comment count; the totals are min-max scaled to the 0-1 range across the
    result set and the mean of the scaled values is returned.

    Credentials are read from the ``REDDIT_CLIENT_ID`` and
    ``REDDIT_CLIENT_SECRET`` environment variables rather than being embedded
    in the notebook. ``REDDIT_USER_AGENT`` optionally overrides the default
    user-agent string.
    NOTE(review): the key pair that used to be hard-coded here has been
    exposed and should be revoked and regenerated.

    Note: once this cell runs, this single-argument definition replaces the
    seven-parameter ``get_reddit_posts`` defined in an earlier cell.

    Args:
        query: search query string.

    Returns:
        float: the mean of the normalized values. ``0.0`` when the search
        returns no posts (previously a ``ZeroDivisionError``) or when every
        post has the same engagement total.

    Raises:
        KeyError: if ``REDDIT_CLIENT_ID`` or ``REDDIT_CLIENT_SECRET`` is unset.
    """
    reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                         client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                         user_agent=os.environ.get(
                             'REDDIT_USER_AGENT',
                             'praw:com.example.myapp:v1.0 (by /u/Sad-Information9604)'),
                         redirect_uri='http://localhost:8000')

    submissions = reddit.subreddit('stocks').search(query, limit=50, time_filter='day')

    # One engagement figure per submission: upvote score plus comment count.
    engagement = [submission.score + submission.num_comments for submission in submissions]

    # No hits: there is nothing to average, so report zero engagement.
    if not engagement:
        return 0.0

    lowest = min(engagement)
    spread = max(engagement) - lowest

    # All posts identical: min-max scaling is undefined, treat every value as 0.
    if spread == 0:
        return 0.0

    return sum((value - lowest) / spread for value in engagement) / len(engagement)



def ranking(text, sentiment_score, source_credibility, alpha=0.5, beta=0.3, gamma=0.2):
    """Combine sentiment, Reddit engagement and source credibility into one score.

    The score is the weighted sum
    ``alpha * sentiment_score + beta * get_reddit_posts(text) + gamma * source_credibility``.

    Args:
        text: text passed to ``get_reddit_posts`` as the search query.
        sentiment_score: numeric sentiment value for ``text``.
        source_credibility: numeric credibility value for the source.
        alpha: weight of ``sentiment_score`` (default 0.5).
        beta: weight of the Reddit engagement value (default 0.3).
        gamma: weight of ``source_credibility`` (default 0.2).

    Returns:
        The weighted sum described above.
    """
    engagement = get_reddit_posts(text)
    return alpha * sentiment_score + beta * engagement + gamma * source_credibility


# Example run of the single-argument get_reddit_posts defined in this cell.
query = 'The stock market is facing a significant downturn due to the economic crisis.'
# NOTE: despite the plural name, this holds the single float returned by get_reddit_posts.
normalized_values = get_reddit_posts(query=query)
print(normalized_values)


0.21245482346399774


In [107]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the FinBERT tone classifier with three output labels.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
# Use a dedicated name for FinBERT's tokenizer. Rebinding the global `tokenizer`
# here would replace the tokenizer that was loaded alongside `model`, so the
# earlier classification cells would pick up the wrong one if re-run afterwards.
finbert_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=finbert_tokenizer)

# Sample sentences to classify with the pipeline.
sentences = ["there is a shortage of capital, and we need extra financing",  
             "growth is strong and we have plenty of liquidity", 
             "there are doubts about our finances", 
             "profits are flat"]
results = nlp(sentences)
print(results)  #LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative

Device set to use cpu


[{'label': 'Negative', 'score': 0.9966173768043518}, {'label': 'Positive', 'score': 1.0}, {'label': 'Negative', 'score': 0.9999710321426392}, {'label': 'Neutral', 'score': 0.9889442920684814}]
