# Stock prediction system

In [1]:
!pip install "numpy<2.3"



In [2]:
import pandas as pd
import numpy as np
import json
import os
from datetime import datetime
import pandas_ta as ta
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from transformers import pipeline
# Import library files

  from .autonotebook import tqdm as notebook_tqdm


## Download the dataset

In [3]:
!git clone https://github.com/yumoxu/stocknet-dataset.git

if os.path.exists('./stocknet-dataset'):
    print("Download successful. The following folders are included.")
    print(os.listdir('./stocknet-dataset'))
else:
    print("Download failed")

Download successful. The following folders are included.
['.git', 'appendix_table_of_target_stocks.pdf', 'LICENSE', 'price', 'README.md', 'StockTable', 'tweet']


fatal: destination path 'stocknet-dataset' already exists and is not an empty directory.


+ Define the preprocessor: configure the data paths and build per-ticker price, indicator, and tweet-sentiment features

In [4]:
class StockNetPreprocessor:
    """Build a per-ticker feature table from a StockNet dataset checkout.

    Prices are read from ``<root>/price/raw/<TICKER>.csv`` and tweets from
    ``<root>/tweet/preprocessed/<TICKER>/<date>``. For every ticker the result
    contains the price columns, MACD / RSI / Bollinger-band indicators, a
    binary next-day ``Target``, a daily ``Sentiment_Score`` and ``Tweet_Volume``,
    and a ``Ticker`` column.

    Args:
        root_dir: Path to the dataset root directory.
        use_bert: When True, tweets are scored with a Hugging Face
            sentiment-analysis pipeline; otherwise a keyword heuristic is used.
    """

    def __init__(self, root_dir, use_bert=False):
        self.root = Path(root_dir)
        self.price_dir = self.root / 'price' / 'raw'
        self.tweet_dir = self.root / 'tweet' / 'preprocessed'
        self.use_bert = use_bert

        if self.use_bert:
            # Imported lazily so the heuristic mode works without transformers.
            from transformers import pipeline
            self.sentiment_pipe = pipeline(
                "sentiment-analysis",
                model="nlptown/bert-base-multilingual-uncased-sentiment",
                device=-1
            )

    def _score_text(self, text):
        """Return an integer sentiment score for one tweet's ``text`` field.

        With ``use_bert`` the score is the leading integer of the pipeline
        label; otherwise it is 5 / 1 / 3 from a keyword check ("bad"/"down"
        take precedence over "good"/"up").
        """
        if self.use_bert:
            # NOTE(review): StockNet's preprocessed tweets appear to store
            # `text` as a list of tokens - confirm against the data files.
            # A list passed to the pipeline is treated as separate inputs, so
            # `[0]` would score only the first token; join it into one string.
            if isinstance(text, (list, tuple)):
                text = ' '.join(str(token) for token in text)
            # Character cap kept from the original; truncation=True additionally
            # asks the tokenizer to cut inputs that exceed the model's limit.
            res = self.sentiment_pipe(text[:512], truncation=True)[0]
            return int(res['label'].split()[0])

        score = 3
        if 'good' in text or 'up' in text:
            score = 5
        if 'bad' in text or 'down' in text:
            score = 1
        return score

    def _parse_tweet_file(self, file_path):
        """Aggregate the tweets in one JSON-lines file.

        Lines that are not valid JSON are skipped. Any other failure (missing
        file, unexpected record shape, scoring error) is reported and the whole
        file is discarded.

        Returns:
            ``{'Sentiment_Score': mean score, 'Tweet_Volume': tweet count}``,
            or None when the file could not be read or produced no scores.
        """
        scores = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        tweet_obj = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    scores.append(self._score_text(tweet_obj.get('text', '')))
        except Exception as e:
            # Best effort: a broken file must not abort the ticker, but the
            # failure is reported instead of being silently swallowed.
            print(f"Error parsing tweets in {file_path}: {e}")
            return None

        if not scores:
            return None

        return {
            'Sentiment_Score': np.mean(scores),
            'Tweet_Volume': len(scores)
        }

    @staticmethod
    def _add_next_day_target(df_price):
        """Return a copy of ``df_price`` with a binary next-day ``Target``.

        ``Target`` is 1 when the next row's Close is strictly higher than the
        current row's Close, else 0. The final row is dropped: it has no next
        close, and comparing against NaN would silently label it 0.
        """
        next_close = df_price['Close'].shift(-1)
        labelled = df_price.assign(
            Target=(next_close > df_price['Close']).astype(int)
        )
        return labelled.iloc[:-1].copy()

    def _collect_tweet_stats(self, ticker):
        """Return one stats dict per dated tweet file found for ``ticker``.

        Files whose name cannot be parsed as a date, and files that yield no
        scores, are skipped. Each dict carries a ``Date`` timestamp.
        """
        ticker_tweet_dir = self.tweet_dir / ticker
        if not ticker_tweet_dir.is_dir():
            return []

        tweet_data = []
        for date_file in ticker_tweet_dir.iterdir():
            if not date_file.is_file():
                continue
            try:
                tweet_date = pd.to_datetime(date_file.name)
            except (ValueError, TypeError, OverflowError):
                # Not a date-named file; ignore it.
                continue
            daily_stats = self._parse_tweet_file(date_file)
            if daily_stats:
                daily_stats['Date'] = tweet_date
                tweet_data.append(daily_stats)
        return tweet_data

    @staticmethod
    def _attach_sentiment(df_price, tweet_data):
        """Left-join daily tweet stats onto the price frame and fill gaps.

        Days without tweets get a neutral score of 3 and a volume of 0. The
        remaining gaps are forward-filled and rows that still contain NaN
        (e.g. indicator warm-up rows) are dropped. ``df_price`` is not mutated.
        """
        if tweet_data:
            df_tweet = pd.DataFrame(tweet_data).set_index('Date')
            master_df = df_price.join(df_tweet, how='left')
        else:
            master_df = df_price.assign(Sentiment_Score=3, Tweet_Volume=0)

        # Assign the filled columns back: chained `col.fillna(..., inplace=True)`
        # does not update the frame under pandas Copy-on-Write, and
        # `fillna(method='ffill')` is deprecated in favour of `ffill()`.
        master_df['Sentiment_Score'] = master_df['Sentiment_Score'].fillna(3)
        master_df['Tweet_Volume'] = master_df['Tweet_Volume'].fillna(0)
        return master_df.ffill().dropna()

    def process_single_ticker(self, ticker):
        """Build the feature table for one ticker.

        Returns:
            A DataFrame indexed by date, or None when the price file is
            missing or the price processing fails.
        """
        price_path = self.price_dir / f"{ticker}.csv"
        if not price_path.exists():
            return None

        try:
            df_price = pd.read_csv(price_path)
            df_price['Date'] = pd.to_datetime(df_price['Date'])
            df_price = df_price.set_index('Date').sort_index()

            # Resolve the column-name clash: when 'Adj Close' exists it replaces
            # 'Close'. Drop any existing 'Close' first so the renamed column is
            # unique and df_price['Close'] is a single Series.
            if 'Adj Close' in df_price.columns:
                df_price = df_price.drop(columns=['Close'], errors='ignore')
                df_price = df_price.rename(columns={'Adj Close': 'Close'})

            # Technical indicators (appended as new columns by pandas_ta).
            df_price.ta.macd(append=True)
            df_price.ta.rsi(length=14, append=True)
            df_price.ta.bbands(length=20, std=2, append=True)

            # Next-day direction label.
            df_price = self._add_next_day_target(df_price)

        except Exception as e:
            print(f"Error processing price for {ticker}: {e}")
            return None

        # Tweet sentiment features.
        tweet_data = self._collect_tweet_stats(ticker)
        master_df = self._attach_sentiment(df_price, tweet_data)

        master_df['Ticker'] = ticker
        return master_df

    def run_all(self, target_tickers=None):
        """Process the given tickers sequentially and concatenate the results.

        Args:
            target_tickers: Iterable of ticker symbols. When None, every
                ``*.csv`` file in the price directory is processed.

        Returns:
            The concatenated DataFrame, or an empty DataFrame when no ticker
            produced data.
        """
        if target_tickers is None:
            target_tickers = [f.stem for f in self.price_dir.glob('*.csv')]

        print(f"Starting processing for {len(target_tickers)} tickers (Sequential Mode)...")

        results = []
        for ticker in tqdm(target_tickers):
            try:
                df = self.process_single_ticker(ticker)
                if df is not None:
                    results.append(df)
            except Exception as e:
                print(f"Failed to process {ticker}: {e}")

        if not results:
            print("No data processed!")
            return pd.DataFrame()

        final_df = pd.concat(results)
        return final_df

+ Run the preprocessor on a few tickers to check that the data is read successfully

In [None]:
if __name__ == "__main__":
    # Run the preprocessor on three tickers, scoring tweets with the BERT pipeline.
    DATA_PATH = './stocknet-dataset'
    processor = StockNetPreprocessor(root_dir=DATA_PATH, use_bert=True)

    # Try the run again.
    master_df = processor.run_all(target_tickers=['AAPL', 'GOOG', 'MSFT'])

    # Persist and preview the result only when at least one ticker produced data.
    if not master_df.empty:
        master_df.to_parquet('master_dataset.parquet')
        print("\nSuccess! Data shape:", master_df.shape)
        print(master_df.head())

Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1150.63it/s, Materializing param=classifier.weight]                                      


Starting processing for 3 tickers (Sequential Mode)...


  0%|          | 0/3 [00:00<?, ?it/s]