In [4]:
# === Step 1: Import Libraries ===
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error
import yfinance as yf

from transformers import pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from google.colab import drive
from tensorflow.keras.callbacks import EarlyStopping

# === Step 2: Load Data ===
drive.mount('/content/drive')

stock_data_path = "/content/drive/MyDrive/stock_prediction/INFY.csv"
news_data_path = "/content/drive/MyDrive/stock_prediction/news_data.csv"

import os
print(os.path.exists(stock_data_path))
print(os.path.exists(news_data_path))

stock_data = pd.read_csv(stock_data_path)
news_data = pd.read_csv(news_data_path)

#print(stock_data.head())
#print(news_data.head())

stock_data['date'] = pd.to_datetime(stock_data['date'], errors='coerce')
stock_data['date'] = stock_data['date'].dt.tz_localize(None)

news_data.rename(columns={'published_at': 'date'}, inplace=True)
news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce').dt.date
news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce')

#print("stock_data:", stock_data.head())
#print("news_data:", news_data.head())
stock_data = stock_data.sort_values(by="date")
news_data = news_data.sort_values(by="date")

stock_data = stock_data[(stock_data['date'] >= "2024-10-01") & (stock_data['date'] <= "2024-12-31")]
news_data = news_data[(news_data['date'] >= "2024-10-01") & (news_data['date'] <= "2024-12-31")]

# === Step 3: Preprocess News Data ===
news_data['title'] = news_data['title'].str.lower()
news_data['description'] = news_data['description'].str.lower()

#print(stock_data.head())
#print(news_data.head())
domains = [
    "Economy", "Technology", "Finance", "Politics", "Healthcare", "Energy", "Automotive", "Retail", "Real Estate",
    "Entertainment", "Education", "Agriculture", "Environment", "Infrastructure", "Telecom", "Defense",
    "Social Media", "Travel & Tourism", "Cryptocurrency", "Consumer Goods"
]

impact_factors = [
    "Positive Market Sentiment", "Negative Market Sentiment", "Regulatory Changes", "Policy Announcements",
    "Trade Agreements", "Inflation Data", "Interest Rate Changes", "Company Earnings Reports", "Product Launches",
    "Layoffs", "Acquisitions", "Mergers", "Partnerships", "Legal Disputes", "Scandals", "Technological Breakthroughs",
    "Cybersecurity Breaches", "Climate Reports", "Natural Disasters", "Global Conflicts", "Sanctions",
    "IPO Announcements", "Stock Buybacks", "Dividend Announcements", "Industry Growth Reports", "Bankruptcies",
    "Start-up Funding", "CEO Changes", "Pandemic-Related News", "Labor Strikes", "Supply Chain Disruptions",
    "Oil Price Fluctuations", "Commodity Price Fluctuations", "Tax Reforms", "Trade War Updates", "Retail Sales Data",
    "New Regulations", "Consumer Behavior Changes", "Market Volatility", "Export/Import Data", "Currency Exchange Rates",
    "Technology Adoption Trends", "Market Penetration Strategies", "Interest from Institutional Investors",
    "Analyst Upgrades/Downgrades", "Lawsuits Against Competitors", "International Market Trends",
    "Political Instability", "Demand-Supply Trends", "Ethical Issues"
]

def assign_domain_and_impact_factor(text):
    text = text.lower()
    matched_domains = [domain for domain in domains if domain.lower() in text]
    matched_factors = [factor for factor in impact_factors if factor.lower() in text]
    return ", ".join(matched_domains) if matched_domains else "Other", ", ".join(matched_factors) if matched_factors else "None"

news_data[['Domain', 'Impact_Factor']] = news_data.apply(
    lambda row: pd.Series(assign_domain_and_impact_factor(row['title'] + " " + row['description'])), axis=1
)
#print(news_data.head())
# === Step 4: sentiment analysis ===
sentiment_analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")

def analyze_sentiment(text):
    if pd.isna(text) or text.strip() == "":
        return 0
    result = sentiment_analyzer(text)[0]
    return result['score'] if result['label'] == 'POSITIVE' else -result['score']

news_data['sentiment'] = news_data.apply(
    lambda row: analyze_sentiment(row['title'] + " " + row['description']), axis=1
)


news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce')
stock_data['date'] = pd.to_datetime(stock_data['date'], errors='coerce')

daily_news = news_data.groupby(pd.Grouper(key='date', freq='D')).agg({
    'sentiment': 'mean',
    'Domain': lambda x: ', '.join(set(x.dropna())),
    'Impact_Factor': lambda x: ', '.join(set(x.dropna()))
}).reset_index()

#print(daily_news.head())
merged_data = pd.merge(stock_data, daily_news, on='date', how='left')

merged_data['sentiment'].fillna(0, inplace=True)
merged_data['Domain'].fillna("No Domain", inplace=True)
merged_data['Impact_Factor'].fillna("No Impact Factor", inplace=True)

for domain in domains:
    merged_data[f'Domain_{domain}_Count'] = merged_data['Domain'].str.contains(domain, na=False).astype(int)

for factor in impact_factors:
    merged_data[f'Factor_{factor}_Count'] = merged_data['Impact_Factor'].str.contains(factor, na=False).astype(int)

merged_data.dropna(inplace=True)

merged_data.drop(columns=['Domain', 'Impact_Factor'], inplace=True)

#print("Merged Data:", merged_data.head())
#print("Total Rows:", len(merged_data))


# === Step 5: LSTM ===
features = merged_data.drop(columns=['date', 'close']).values
target = merged_data['close'].values

print("Shape of features:", features.shape)
print("Shape of target:", target.shape)

scaler_y = MinMaxScaler()
target_scaled = scaler_y.fit_transform(target.reshape(-1, 1)).flatten()

scaler_X = MinMaxScaler()
features_scaled = scaler_X.fit_transform(features)

def create_sequences(data, labels, sequence_length=20):
    x_seq, y_seq = [], []
    for i in range(len(data) - sequence_length):
        x_seq.append(data[i:i+sequence_length])
        y_seq.append(labels[i+sequence_length])
    return np.array(x_seq), np.array(y_seq)

x_seq, y_seq = create_sequences(features_scaled, target_scaled)

print(f"x_seq shape after creating sequences: {x_seq.shape}")

train_size = int(len(x_seq) * 0.8)
x_train, x_test = x_seq[:train_size], x_seq[train_size:]
y_train, y_test = y_seq[:train_size], y_seq[train_size:]

model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])),
    Dropout(0.2),

    LSTM(32),
    Dropout(0.2),

    Dense(1)
])

model.compile(optimizer='adam', loss='mse')

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    x_train, y_train,
    epochs=50, batch_size=8,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping]
)

y_pred_scaled = model.predict(x_test)
y_pred = scaler_y.inverse_transform(y_pred_scaled)

y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1))

print("First 5 Predictions:", y_pred)
#print("First 5 Actual Values:", y_test_original[:5].flatten())


# === Step 6: Prediction ===
predictions = []
x_last = x_test[-1].reshape(1, 20, 76)

for _ in range(17):


    print(x_last.shape)

    pred_price_scaled = model.predict(x_last)[0, 0]
    predictions.append(pred_price_scaled)

    new_row = np.zeros((1, 20, 76))
    new_row[0, -1, :] = features_scaled[-1]
    new_row[0, :-1, :] = x_last[0, 1:, :]
    new_row[0, -1, -1] = pred_price_scaled

    x_last = new_row

predictions_original = scaler_y.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

last_date = stock_data['date'].max()
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=17, freq='D')

output_df = pd.DataFrame({"date": future_dates, "Predicted_Close": predictions_original})
output_df.to_csv("output.csv", index=False)

#print(output_df)

actual_stock_df = yf.download("INFY.BO", start="2025-01-02", end="2025-01-17")[['Close']].reset_index()
actual_stock_df.rename(columns={'Date': 'date', 'Close': 'actual_close'}, inplace=True)

if isinstance(actual_stock_df.columns, pd.MultiIndex):
    actual_stock_df.columns = actual_stock_df.columns.droplevel(1)


evaluation_df = pd.merge(output_df, actual_stock_df, on='date', how='inner')

#print("Merged Data:",evaluation_df)
# Compute MAPE
mape = mean_absolute_percentage_error(evaluation_df['actual_close'], evaluation_df['Predicted_Close'])
#print(f"Mean Absolute Percentage Error (MAPE): {mape * 100:.2f}%")


#print("Predicted Prices:",output_df)
#print("Actual Stock Prices:",actual_stock_df)

print("evaluation_df columns:", evaluation_df.columns)
print("daily_news columns:", daily_news.columns)
#print(daily_news.columns)
print(daily_news.head())

#print(evaluation_df.head())

final_output_df = pd.merge(evaluation_df, daily_news, on='date', how='left')

final_output_df['MAPE (%)'] = abs((final_output_df['actual_close'] - final_output_df['Predicted_Close']) / final_output_df['actual_close']) * 100

final_output_df.rename(columns={
    'Predicted_Close': 'Predicted Close Price',
    'actual_close': 'Actual Close Price',
    'sentiment': 'Sentiment Score',
    'Domain': 'Prediction Domain',
    'Impact_Factor': 'Impact Factor'
}, inplace=True)

# Save to CSV
final_output_df.to_csv("/content/drive/MyDrive/stock_prediction/final_ouput.csv", index=False)

# Display the final dataframe
print(final_output_df)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
True
True


Device set to use cpu
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['sentiment'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['Domain'].fillna("No Domain", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obje

Shape of features: (62, 76)
Shape of target: (62,)
x_seq shape after creating sequences: (42, 20, 76)
Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 137ms/step - loss: 0.3855 - val_loss: 0.0356
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 0.0893 - val_loss: 0.0339
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.1763 - val_loss: 0.0105
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 0.0849 - val_loss: 0.0447
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 0.0748 - val_loss: 0.0538
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 0.0915 - val_loss: 0.0340
Epoch 7/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 0.0659 - val_loss: 0.0095
Epoch 8/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/

[*********************100%***********************]  1 of 1 completed

evaluation_df columns: Index(['date', 'Predicted_Close', 'actual_close'], dtype='object')
daily_news columns: Index(['date', 'sentiment', 'Domain', 'Impact_Factor'], dtype='object')
        date  ...                                      Impact_Factor
0 2024-12-24  ...                             None, Trade Agreements
1 2024-12-25  ...                                    None, Sanctions
2 2024-12-26  ...                             None, Trade Agreements
3 2024-12-27  ...  Acquisitions, Mergers, New Regulations, Merger...
4 2024-12-28  ...                            None, Market Volatility

[5 rows x 4 columns]
         date  Predicted Close Price  Actual Close Price  ...  Prediction Domain Impact Factor  MAPE (%)
0  2025-01-02            1888.265869         1957.650024  ...                NaN           NaN  3.544257
1  2025-01-03            1881.885864         1938.300049  ...                NaN           NaN  2.910498
2  2025-01-06            1865.317139         1937.849976  ...      


