# Analyzing Chinese Stock Markets: A Comprehensive Study
## Data Collection and Analysis System



### Setup and Dependencies

First, let's import all necessary libraries and set up our environment.

In [None]:
# Basic data manipulation and analysis
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): newer matplotlib releases renamed the bundled seaborn style
# sheets (e.g. 'seaborn-v0_8') -- confirm 'seaborn' is accepted by the
# installed version.
plt.style.use('seaborn')

# Financial analysis
import yfinance as yf
from pandas_datareader import data as pdr
# NOTE(review): confirm the installed yfinance still ships pdr_override();
# the `pdr` alias is not referenced anywhere in the visible cells.
yf.pdr_override()

# Statistical analysis
from scipy import stats
from statsmodels.tsa.stattools import adfuller

# Database management
import sqlite3

# Suppress warnings
# This silences every warning category for the rest of the session,
# including deprecation warnings raised by pandas/matplotlib.
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)  # never truncate columns when printing
pd.set_option('display.float_format', lambda x: '%.3f' % x)  # three decimals

print("Setup completed successfully!")

### Database Setup

We'll create a SQLite database to store our financial data. The database will include tables for:
1. Daily stock prices and volumes
2. Company information
3. Market indices
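4. Sector classifications (kept as the `sector` and `industry` columns of the company information table rather than in a separate table)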

In [None]:
class DatabaseManager:
    """SQLite store for stock prices, company information and index levels.

    The connection is opened in ``__init__`` and stays open until ``close``
    is called.  Dates in ``stock_data`` are written as ISO ``YYYY-MM-DD``
    text so that string comparison in ``BETWEEN`` matches calendar order.
    """

    # Columns of ``stock_data`` in schema order.  Also used as the whitelist
    # for the INSERT statement built in ``store_stock_data``.
    STOCK_DATA_COLUMNS = (
        'date', 'symbol', 'open', 'high', 'low', 'close',
        'volume', 'adj_close', 'market_cap',
    )

    def __init__(self, db_name='chinese_stocks.db'):
        """Initialize database connection and create tables.

        Args:
            db_name: SQLite file path (or ``':memory:'``).
        """
        self.conn = sqlite3.connect(db_name)
        self.cursor = self.conn.cursor()
        self.create_tables()

    def create_tables(self):
        """Create all necessary database tables (no-op if they exist)."""
        # Stock price data table
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS stock_data (
                date TEXT,
                symbol TEXT,
                open REAL,
                high REAL,
                low REAL,
                close REAL,
                volume REAL,
                adj_close REAL,
                market_cap REAL,
                PRIMARY KEY (date, symbol)
            )
        ''')

        # Company information table
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS company_info (
                symbol TEXT PRIMARY KEY,
                name TEXT,
                sector TEXT,
                industry TEXT,
                exchange TEXT,
                description TEXT
            )
        ''')

        # Market indices table
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS market_indices (
                date TEXT,
                index_code TEXT,
                value REAL,
                change_pct REAL,
                PRIMARY KEY (date, index_code)
            )
        ''')

        self.conn.commit()

    @staticmethod
    def _to_date_text(value):
        """Return ``value`` as ``YYYY-MM-DD`` text; strings and None pass through."""
        if value is None or isinstance(value, str):
            return value
        return pd.Timestamp(value).strftime('%Y-%m-%d')

    @staticmethod
    def _to_sql_value(value):
        """Convert a cell to something sqlite3 can bind (NaN -> NULL)."""
        if pd.isna(value):
            return None
        # numpy scalars expose .item(); plain Python values are used as-is.
        return value.item() if hasattr(value, 'item') else value

    def store_stock_data(self, df, symbol):
        """Store stock price data in database.

        Column names are normalised to the table schema (lower case, spaces
        replaced by underscores, so ``Adj Close`` becomes ``adj_close``) and
        columns the table does not have are dropped.  Rows whose
        ``(date, symbol)`` key already exists are replaced, which makes
        re-running a download safe.

        Args:
            df: Price frame indexed by date.
            symbol: Ticker written to the ``symbol`` column of every row.

        Raises:
            ValueError: If no date column can be found after resetting the index.
        """
        frame = df.reset_index()
        if isinstance(frame.columns, pd.MultiIndex):
            # Keep only the field level when columns are (field, ticker) pairs.
            frame.columns = frame.columns.get_level_values(0)
        frame.columns = [
            str(col).strip().lower().replace(' ', '_') for col in frame.columns
        ]

        if 'date' not in frame.columns:
            # An unnamed index becomes 'index'; intraday frames use 'datetime'.
            for candidate in ('datetime', 'index'):
                if candidate in frame.columns:
                    frame = frame.rename(columns={candidate: 'date'})
                    break
            else:
                raise ValueError("stock data has no date index or column")

        frame['date'] = pd.to_datetime(frame['date']).dt.strftime('%Y-%m-%d')
        frame['symbol'] = symbol

        columns = [c for c in self.STOCK_DATA_COLUMNS if c in frame.columns]
        rows = [
            tuple(self._to_sql_value(v) for v in record)
            for record in frame[columns].itertuples(index=False, name=None)
        ]
        placeholders = ', '.join('?' for _ in columns)
        # Column names come from the fixed whitelist above, never from input.
        sql = (
            f"INSERT OR REPLACE INTO stock_data ({', '.join(columns)}) "
            f"VALUES ({placeholders})"
        )
        self.conn.executemany(sql, rows)
        self.conn.commit()

    def get_stock_data(self, symbol, start_date, end_date):
        """Retrieve stock data from database.

        Args:
            symbol: Ticker to select.
            start_date: Inclusive lower bound (string, date or datetime).
            end_date: Inclusive upper bound (string, date or datetime).

        Returns:
            DataFrame of matching ``stock_data`` rows ordered by date.
        """
        query = """
        SELECT *
        FROM stock_data
        WHERE symbol = ? AND date BETWEEN ? AND ?
        ORDER BY date
        """
        params = (
            symbol,
            self._to_date_text(start_date),
            self._to_date_text(end_date),
        )
        return pd.read_sql_query(query, self.conn, params=params)

    def close(self):
        """Close database connection"""
        self.conn.close()

# Initialize database
# No file name is passed, so the default 'chinese_stocks.db' is used; this
# module-level handle is reused by the collection and saving cells below.
db = DatabaseManager()
print("Database initialized successfully!")

### Stock Universe Definition

Define the list of stocks we'll analyze, including major companies from both SSE and SZSE:

In [None]:
# Stock universe: exchange code -> {ticker: company name}
stock_universe = {
    # Shanghai Stock Exchange
    'SSE': {
        '600519.SS': 'Kweichow Moutai',
        '601318.SS': 'Ping An Insurance',
        '600036.SS': 'China Merchants Bank',
        '601398.SS': 'ICBC',
        '600276.SS': 'Jiangsu Hengrui Medicine',
    },
    # Shenzhen Stock Exchange
    'SZSE': {
        '000858.SZ': 'Wuliangye Yibin',
        '000333.SZ': 'Midea Group',
        '000651.SZ': 'Gree Electric',
        '000002.SZ': 'Vanke A',
        '002594.SZ': 'BYD Company',
    },
}

# Flatten the nested mapping into one row per stock
stock_info = pd.DataFrame(
    [
        (ticker, company, market)
        for market, listings in stock_universe.items()
        for ticker, company in listings.items()
    ],
    columns=['symbol', 'name', 'exchange'],
)

print("Stock universe defined:")
display(stock_info)

### Configuration Settings

Define global settings and parameters for our analysis:

In [None]:
class Config:
    """Global constants shared by the analysis cells (read as class attributes)."""

    # Date ranges
    # END_DATE is evaluated once, when this class body runs, not on each access.
    END_DATE = datetime.now()
    START_DATE = END_DATE - timedelta(days=365*2)  # 2 years of data
    
    # Technical analysis parameters
    MOVING_AVERAGES = [20, 50, 200]  # Days for moving averages
    VOLATILITY_WINDOW = 30  # Days for volatility calculation
    
    # Visualization settings
    # NOTE(review): PLOT_STYLE / PLOT_FIGSIZE are not referenced in the visible
    # cells, which hard-code their own style and figure sizes.
    PLOT_STYLE = 'seaborn'
    PLOT_FIGSIZE = (15, 8)
    
    # Analysis parameters
    # NOTE(review): neither threshold is referenced in the visible cells.
    CORRELATION_THRESHOLD = 0.7
    ZSCORE_THRESHOLD = 2.0
    
    # Performance metrics
    RISK_FREE_RATE = 0.03  # 3% annual risk-free rate

print("Configuration settings initialized!")
print(f"Analysis period: {Config.START_DATE.date()} to {Config.END_DATE.date()}")

### Data Collection System

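We'll create a collector that downloads historical price data for each stock, keeps track of the symbols whose download failed, and stores every successful download in the database. Missing-value handling and data cleaning follow in the preprocessing section.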

In [None]:
class DataCollector:
    """Download price history per symbol and hand it to the database manager.

    Attributes:
        db: Object providing ``store_stock_data(df, symbol)``.
        failed_symbols: Symbols whose download raised or returned no rows
            (each symbol listed once).
        storage_failures: Symbols that downloaded fine but could not be
            written to the database.
    """

    def __init__(self, db_manager):
        """Initialize data collector with database manager"""
        self.db = db_manager
        self.failed_symbols = []
        self.storage_failures = []

    def _mark_failed(self, symbol):
        """Record a failed download without listing a symbol twice."""
        if symbol not in self.failed_symbols:
            self.failed_symbols.append(symbol)

    def fetch_stock_data(self, symbol, start_date, end_date):
        """Fetch stock data from Yahoo Finance.

        Returns:
            The downloaded frame, or None when the download raised or came
            back empty (the symbol is then added to ``failed_symbols``).
        """
        try:
            data = yf.download(symbol, start=start_date, end=end_date, progress=False)
        except Exception as e:
            # Deliberately broad: this is the network boundary and one bad
            # symbol must not stop the rest of the batch.
            print(f"Error fetching data for {symbol}: {str(e)}")
            self._mark_failed(symbol)
            return None

        if data is None or len(data) == 0:
            print(f"No data returned for {symbol}")
            self._mark_failed(symbol)
            return None
        return data

    def fetch_multiple_stocks(self, symbols, start_date, end_date):
        """Fetch data for multiple stocks and store each result.

        A storage error for one symbol is reported and recorded in
        ``storage_failures``; the downloaded frame is still returned and the
        remaining symbols are still processed.

        Returns:
            dict mapping each successfully downloaded symbol to its frame.
        """
        all_data = {}
        for symbol in symbols:
            print(f"Fetching data for {symbol}...")
            data = self.fetch_stock_data(symbol, start_date, end_date)
            if data is None:
                continue
            all_data[symbol] = data
            try:
                self.db.store_stock_data(data, symbol)
            except (sqlite3.Error, OSError, ValueError) as e:
                # OSError also covers pandas' DatabaseError, which derives from it.
                print(f"Error storing data for {symbol}: {str(e)}")
                if symbol not in self.storage_failures:
                    self.storage_failures.append(symbol)
        return all_data

# Initialize data collector
collector = DataCollector(db)

# Fetch data for all stocks
# Flatten the per-exchange ticker mappings into one list of symbols.
all_symbols = [symbol for stocks in stock_universe.values() for symbol in stocks.keys()]
# stock_data maps symbol -> downloaded frame; symbols that failed are absent.
stock_data = collector.fetch_multiple_stocks(all_symbols, Config.START_DATE, Config.END_DATE)

print(f"\nData collection completed!")
print(f"Successfully collected data for {len(stock_data)} stocks")
if collector.failed_symbols:
    print(f"Failed to collect data for {len(collector.failed_symbols)} stocks: {collector.failed_symbols}")

## Data Preprocessing

Now let's clean and preprocess the collected data:

In [None]:
class DataPreprocessor:
    """Clean raw price frames and derive return/volatility/average columns."""

    # Price columns that are forward-filled, in the order they are processed.
    _PRICE_COLUMNS = ('Close', 'Open', 'High', 'Low')

    def __init__(self, stock_data):
        """Keep the raw ``{symbol: DataFrame}`` mapping and an empty result dict."""
        self.stock_data = stock_data
        self.processed_data = {}

    def handle_missing_values(self, df):
        """Forward-fill price gaps and treat missing volume as zero (in place)."""
        for column in self._PRICE_COLUMNS:
            df[column] = df[column].ffill()

        df['Volume'] = df['Volume'].fillna(0)
        return df

    def calculate_returns(self, df):
        """Add ``Daily_Return`` and its compounded ``Cumulative_Return``."""
        closes = df['Close']
        df['Daily_Return'] = closes.pct_change()
        growth = 1 + df['Daily_Return']
        df['Cumulative_Return'] = growth.cumprod()
        return df

    def calculate_volatility(self, df):
        """Add annualised rolling volatility over ``Config.VOLATILITY_WINDOW`` days."""
        rolling_std = df['Daily_Return'].rolling(window=Config.VOLATILITY_WINDOW).std()
        df['Volatility'] = rolling_std * np.sqrt(252)
        return df

    def calculate_moving_averages(self, df):
        """Add one ``MA_<n>`` column per period in ``Config.MOVING_AVERAGES``."""
        closes = df['Close']
        for days in Config.MOVING_AVERAGES:
            df[f'MA_{days}'] = closes.rolling(window=days).mean()
        return df

    def process_all_stocks(self):
        """Run every step on a copy of each frame; return ``processed_data``."""
        steps = (
            self.handle_missing_values,
            self.calculate_returns,
            self.calculate_volatility,
            self.calculate_moving_averages,
        )
        for symbol, raw in self.stock_data.items():
            frame = raw.copy()  # leave the collected data untouched
            for step in steps:
                frame = step(frame)
            self.processed_data[symbol] = frame

        return self.processed_data

# Process the data
preprocessor = DataPreprocessor(stock_data)
# processed_data maps symbol -> frame with returns, volatility and MA columns.
processed_data = preprocessor.process_all_stocks()

# Display sample of processed data
# Indexing [0] raises IndexError when no stock was collected.
sample_symbol = list(processed_data.keys())[0]
print(f"\nSample of processed data for {sample_symbol}:")
display(processed_data[sample_symbol].head())

### Data Quality Checks

Let's perform some basic data quality checks on our processed data:

In [None]:
def perform_data_quality_checks(processed_data):
    """Perform various data quality checks"""
    quality_report = {}
    
    for symbol, data in processed_data.items():
        report = {
            'total_rows': len(data),
            'missing_values': data.isnull().sum().to_dict(),
            'negative_prices': (data['Close'] < 0).sum(),
            'zero_prices': (data['Close'] == 0).sum(),
            'date_range': f"{data.index.min()} to {data.index.max()}",
            'trading_days': len(data),
            'price_range': f"{data['Close'].min():.2f} - {data['Close'].max():.2f}"
        }
        
        # Check for extreme returns
        mean_return = data['Daily_Return'].mean()
        std_return = data['Daily_Return'].std()
        extreme_returns = data['Daily_Return'][abs(data['Daily_Return'] - mean_return) > 3 * std_return]
        report['extreme_returns_count'] = len(extreme_returns)
        
        quality_report[symbol] = report
    
    return pd.DataFrame(quality_report).T

# Run quality checks
# One row per symbol, one column per check.
quality_report = perform_data_quality_checks(processed_data)
print("Data Quality Report:")
display(quality_report)

### Initial Data Summary

Let's create a summary of our processed data:

In [None]:
def create_data_summary(processed_data):
    """Create summary statistics for all stocks.

    Args:
        processed_data: Mapping of symbol to a frame with ``Close``,
            ``Volume``, ``Daily_Return`` and ``Volatility`` columns.

    Returns:
        DataFrame with one row per non-empty stock, rounded to 2 decimals.
        Percentages (returns, volatility, drawdown) are scaled by 100.
    """
    summary_data = []

    for symbol, data in processed_data.items():
        if data.empty:
            # Nothing to summarise; first/last price would not exist.
            continue

        close = data['Close']
        # .iloc selects by position.  Plain close[-1] / close[0] is a label
        # lookup, which fails on an integer index and is deprecated as a
        # positional fallback on a date index.
        first_price = close.iloc[0]
        last_price = close.iloc[-1]

        summary_data.append({
            'Symbol': symbol,
            'Start_Date': data.index.min(),
            'End_Date': data.index.max(),
            'Avg_Price': close.mean(),
            'Avg_Volume': data['Volume'].mean(),
            'Avg_Daily_Return': data['Daily_Return'].mean() * 100,
            'Volatility': data['Volatility'].mean() * 100,
            # Worst decline from a running peak
            'Max_Drawdown': (close / close.cummax() - 1).min() * 100,
            'Total_Return': (last_price / first_price - 1) * 100,
        })

    return pd.DataFrame(summary_data).round(2)

# Create and display summary
# One row per symbol with averages, drawdown and total return.
data_summary = create_data_summary(processed_data)
print("Data Summary:")
display(data_summary)

### Save Processed Data

Finally, let's save our processed data back to the database for future use:

In [None]:
def save_processed_data(processed_data, db):
    """Save processed data to database.

    Rows already stored for a symbol are removed before that symbol's frame
    is written, so running the cell again refreshes the data instead of
    duplicating every row.

    Args:
        processed_data: Mapping of symbol to processed frame.
        db: Object exposing an open sqlite3 connection as ``db.conn``.
    """
    table = 'processed_stock_data'

    for symbol, data in processed_data.items():
        # Prepare data for storage
        df_to_save = data.reset_index()
        df_to_save['symbol'] = symbol

        # The table is created by the first to_sql call, so it may not exist yet.
        table_exists = db.conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?",
            (table,),
        ).fetchone()
        if table_exists:
            db.conn.execute(f"DELETE FROM {table} WHERE symbol = ?", (symbol,))

        # Store in database
        df_to_save.to_sql(table, db.conn, if_exists='append', index=False)

    db.conn.commit()
    print("Processed data saved successfully!")

# Save the processed data
# Writes every processed frame to the 'processed_stock_data' table via db.conn.
save_processed_data(processed_data, db)

### Core Analysis Classes

In [None]:
class TechnicalAnalysis:
    """Compute RSI, MACD and Bollinger Bands for a ``{symbol: frame}`` mapping."""

    def __init__(self, data):
        self.data = data

    def calculate_RSI(self, df, periods=14):
        """Return the RSI of ``Close`` using simple rolling averages of gains/losses."""
        change = df['Close'].diff()
        # Non-gains (including the leading NaN) count as 0 in the averages.
        avg_gain = change.where(change > 0, 0).rolling(window=periods).mean()
        avg_loss = (-change.where(change < 0, 0)).rolling(window=periods).mean()
        relative_strength = avg_gain / avg_loss
        return 100 - (100 / (1 + relative_strength))

    def calculate_MACD(self, df, short_period=12, long_period=26, signal_period=9):
        """Return ``(macd, signal)``: fast EMA minus slow EMA, and its own EMA."""
        closes = df['Close']
        fast = closes.ewm(span=short_period, adjust=False).mean()
        slow = closes.ewm(span=long_period, adjust=False).mean()
        macd_line = fast - slow
        signal_line = macd_line.ewm(span=signal_period, adjust=False).mean()
        return macd_line, signal_line

    def calculate_bollinger_bands(self, df, window=20, num_std=2):
        """Return ``(middle, upper, lower)`` bands around the rolling mean of ``Close``."""
        window_view = df['Close'].rolling(window=window)
        middle = window_view.mean()
        spread = window_view.std() * num_std
        return middle, middle + spread, middle - spread

    def add_technical_indicators(self):
        """Add all indicator columns to every frame in place and return ``self.data``."""
        for symbol, frame in self.data.items():
            frame['RSI'] = self.calculate_RSI(frame)

            macd_line, signal_line = self.calculate_MACD(frame)
            frame['MACD'] = macd_line
            frame['MACD_Signal'] = signal_line
            frame['MACD_Histogram'] = frame['MACD'] - frame['MACD_Signal']

            middle, upper, lower = self.calculate_bollinger_bands(frame)
            frame['BB_Middle'] = middle
            frame['BB_Upper'] = upper
            frame['BB_Lower'] = lower

            self.data[symbol] = frame

        return self.data

class MarketVisualizer:
    """Matplotlib/seaborn charts for frames that already carry indicator columns."""

    def __init__(self, analysis_data):
        self.data = analysis_data

    def plot_price_with_indicators(self, symbol):
        """Draw price with Bollinger Bands, MACD and RSI as three stacked panels."""
        frame = self.data[symbol]
        dates = frame.index

        # Price panel is twice as tall as each indicator panel
        fig = plt.figure(figsize=(15, 10))
        grid = fig.add_gridspec(3, 1, height_ratios=[2, 1, 1])

        # Price and Bollinger Bands
        price_ax = fig.add_subplot(grid[0])
        price_ax.plot(dates, frame['Close'], label='Price', color='blue')
        bands = (
            ('BB_Upper', 'Upper BB'),
            ('BB_Middle', 'Middle BB'),
            ('BB_Lower', 'Lower BB'),
        )
        for column, caption in bands:
            price_ax.plot(dates, frame[column], '--', label=caption, color='gray', alpha=0.5)
        price_ax.fill_between(dates, frame['BB_Upper'], frame['BB_Lower'], alpha=0.1)
        price_ax.set_title(f'{symbol} Price and Technical Indicators')
        price_ax.legend()
        price_ax.grid(True)

        # MACD
        macd_ax = fig.add_subplot(grid[1])
        macd_ax.plot(dates, frame['MACD'], label='MACD', color='blue')
        macd_ax.plot(dates, frame['MACD_Signal'], label='Signal', color='orange')
        macd_ax.bar(dates, frame['MACD_Histogram'], label='Histogram', color='gray', alpha=0.3)
        macd_ax.set_title('MACD')
        macd_ax.legend()
        macd_ax.grid(True)

        # RSI with the 70 / 30 reference lines
        rsi_ax = fig.add_subplot(grid[2])
        rsi_ax.plot(dates, frame['RSI'], label='RSI', color='purple')
        for level, line_color in ((70, 'r'), (30, 'g')):
            rsi_ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.5)
        rsi_ax.set_title('RSI')
        rsi_ax.legend()
        rsi_ax.grid(True)

        plt.tight_layout()
        plt.show()

    def plot_market_comparison(self):
        """Plot every stock's close rebased to 100 at its first row."""
        plt.figure(figsize=(15, 6))

        for symbol, frame in self.data.items():
            closes = frame['Close']
            rebased = closes / closes.iloc[0] * 100
            plt.plot(frame.index, rebased, label=symbol)

        plt.title('Normalized Price Comparison (Base=100)')
        plt.xlabel('Date')
        plt.ylabel('Normalized Price')
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    def plot_correlation_heatmap(self):
        """Show the correlation matrix of daily returns as an annotated heatmap."""
        returns_df = pd.DataFrame(
            {symbol: frame['Daily_Return'] for symbol, frame in self.data.items()}
        )
        correlation_matrix = returns_df.corr()

        plt.figure(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
        plt.title('Stock Returns Correlation Matrix')
        plt.tight_layout()
        plt.show()

class VolumeAnalysis:
    """Volume charts and summary statistics for a ``{symbol: frame}`` mapping."""

    def __init__(self, data):
        self.data = data

    def analyze_volume_trends(self, symbol):
        """Analyze trading volume trends for one stock.

        Adds ``Volume_MA_10`` and ``Volume_MA_30`` columns to the stored
        frame, shows a two-panel chart, and returns summary statistics.

        Returns:
            Series with average/max volume, coefficient of variation and the
            percentage gap between the latest 10-day and 30-day averages
            (NaN while fewer than 30 rows are available).
        """
        df = self.data[symbol]

        # Calculate volume metrics
        df['Volume_MA_10'] = df['Volume'].rolling(window=10).mean()
        df['Volume_MA_30'] = df['Volume'].rolling(window=30).mean()

        # Plot volume analysis
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

        # Price and volume share the x axis but use separate y axes
        ax1.plot(df.index, df['Close'], color='blue')
        ax1_twin = ax1.twinx()
        ax1_twin.bar(df.index, df['Volume'], alpha=0.3, color='gray')
        ax1.set_title(f'{symbol} Price and Volume')
        ax1.set_ylabel('Price')
        ax1_twin.set_ylabel('Volume')

        # Volume moving averages
        ax2.plot(df.index, df['Volume_MA_10'], label='10-day MA', color='blue')
        ax2.plot(df.index, df['Volume_MA_30'], label='30-day MA', color='orange')
        ax2.bar(df.index, df['Volume'], alpha=0.3, color='gray')
        ax2.set_title('Volume Analysis')
        ax2.legend()

        plt.tight_layout()
        plt.show()

        # .iloc[-1] takes the last row by position; series[-1] would be a
        # label lookup, which does not work on a date or integer index.
        latest_short_ma = df['Volume_MA_10'].iloc[-1]
        latest_long_ma = df['Volume_MA_30'].iloc[-1]

        # Calculate volume statistics
        volume_stats = {
            'Average Daily Volume': df['Volume'].mean(),
            'Max Daily Volume': df['Volume'].max(),
            'Volume Volatility': df['Volume'].std() / df['Volume'].mean(),
            'Volume Trend': (latest_short_ma / latest_long_ma - 1) * 100
        }

        return pd.Series(volume_stats)

    def analyze_all_stocks(self):
        """Analyze volume trends for all stocks; one row per symbol."""
        all_stats = {
            symbol: self.analyze_volume_trends(symbol) for symbol in self.data
        }
        return pd.DataFrame(all_stats).T

class PatternAnalysis:
    """Trend labels, swing detection and support/resistance levels per stock."""

    def __init__(self, data):
        self.data = data

    def identify_trends(self, df, window=20):
        """Label each row by where ``Close`` sits relative to ``MA_<window>``."""
        average = df[f'MA_{window}']
        above = df['Close'] > average
        below = df['Close'] < average

        # Rows that are neither above nor below (equal or NaN) stay 'Neutral'
        df['Trend'] = 'Neutral'
        df.loc[above, 'Trend'] = 'Uptrend'
        df.loc[below, 'Trend'] = 'Downtrend'
        return df

    def detect_swings(self, df, threshold=0.02):
        """Flag days whose close-to-close change exceeds ``threshold`` either way."""
        df['Price_Change'] = df['Close'].pct_change()
        df['Swing'] = 'None'

        moves = df['Price_Change']
        df.loc[moves > threshold, 'Swing'] = 'Up'
        df.loc[moves < -threshold, 'Swing'] = 'Down'
        return df

    def find_support_resistance(self, df, window=20):
        """Return ``(support_levels, resistance_levels)`` that prices revisit often.

        Candidates are the centred rolling minimum of ``Low`` and maximum of
        ``High``.  A candidate is kept when more than ``window / 2`` rows lie
        within 2% of the total price range of it.
        """
        lows, highs = df['Low'], df['High']
        df['Support'] = lows.rolling(window=window, center=True).min()
        df['Resistance'] = highs.rolling(window=window, center=True).max()

        # 2% of total range
        tolerance = (highs.max() - lows.min()) * 0.02
        min_touches = window / 2

        def _frequent(candidates, prices):
            return [
                level
                for level in candidates.dropna().unique()
                if (abs(prices - level) < tolerance).sum() > min_touches
            ]

        support_levels = _frequent(df['Support'], df['Low'])
        resistance_levels = _frequent(df['Resistance'], df['High'])
        return support_levels, resistance_levels

    def analyze_patterns(self, symbol):
        """Run all pattern steps on a copy of one stock, plot, and return counts."""
        frame = self.data[symbol].copy()

        frame = self.identify_trends(frame)
        frame = self.detect_swings(frame)
        support_levels, resistance_levels = self.find_support_resistance(frame)

        fig, (level_ax, trend_ax) = plt.subplots(2, 1, figsize=(15, 12))

        # Price with support (green) and resistance (red) lines
        level_ax.plot(frame.index, frame['Close'], label='Price')
        for level in support_levels:
            level_ax.axhline(y=level, color='g', linestyle='--', alpha=0.5)
        for level in resistance_levels:
            level_ax.axhline(y=level, color='r', linestyle='--', alpha=0.5)

        level_ax.set_title(f'{symbol} Price with Support/Resistance Levels')
        level_ax.legend()
        level_ax.grid(True)

        # One scatter series per trend label
        colors = {'Uptrend': 'g', 'Downtrend': 'r', 'Neutral': 'gray'}
        for trend, color in colors.items():
            selected = frame[frame['Trend'] == trend]
            trend_ax.scatter(selected.index, selected['Close'],
                             c=color, label=trend, alpha=0.5)

        trend_ax.set_title('Trend Analysis')
        trend_ax.legend()
        trend_ax.grid(True)

        plt.tight_layout()
        plt.show()

        trends = frame['Trend']
        swings = frame['Swing']
        pattern_stats = {
            'Uptrend_Days': (trends == 'Uptrend').sum(),
            'Downtrend_Days': (trends == 'Downtrend').sum(),
            'Neutral_Days': (trends == 'Neutral').sum(),
            'Major_Swings_Up': (swings == 'Up').sum(),
            'Major_Swings_Down': (swings == 'Down').sum(),
            'Support_Levels': len(support_levels),
            'Resistance_Levels': len(resistance_levels)
        }

        return pd.Series(pattern_stats)

    def analyze_all_stocks(self):
        """Analyze patterns for all stocks; one row per symbol."""
        pattern_stats = {}
        for symbol in self.data:
            print(f"\nAnalyzing patterns for {symbol}...")
            pattern_stats[symbol] = self.analyze_patterns(symbol)

        return pd.DataFrame(pattern_stats).T

class RiskAnalysis:
    """Risk metrics and risk charts for a ``{symbol: frame}`` mapping."""

    def __init__(self, data):
        self.data = data

    def calculate_risk_metrics(self, df, risk_free_rate=None):
        """Calculate various risk metrics for one stock.

        Args:
            df: Frame with ``Daily_Return`` and ``Close`` columns.
            risk_free_rate: Annual risk-free rate used for the Sharpe ratio.
                Defaults to ``Config.RISK_FREE_RATE``.

        Returns:
            Series with annualised volatility, downside volatility, the 5th
            and 1st percentile daily return (``VaR_95`` / ``VaR_99``),
            maximum drawdown and annualised Sharpe ratio.
        """
        if risk_free_rate is None:
            risk_free_rate = Config.RISK_FREE_RATE

        # The first return of a pct_change series is NaN.  np.percentile
        # propagates NaN, so missing returns are dropped up front.
        daily_returns = df['Daily_Return'].dropna()

        # Volatility metrics
        annual_volatility = daily_returns.std() * np.sqrt(252)

        # Downside risk: dispersion of the losing days only
        downside_returns = daily_returns[daily_returns < 0]
        downside_volatility = downside_returns.std() * np.sqrt(252)

        # Value at Risk (VaR) as empirical percentiles of daily returns
        if daily_returns.empty:
            var_95 = var_99 = np.nan
        else:
            var_95 = np.percentile(daily_returns, 5)
            var_99 = np.percentile(daily_returns, 1)

        # Maximum drawdown: worst decline from a running peak
        rolling_max = df['Close'].expanding().max()
        drawdowns = df['Close']/rolling_max - 1
        max_drawdown = drawdowns.min()

        # Sharpe ratio on daily excess returns, annualised
        excess_returns = daily_returns - risk_free_rate/252
        sharpe_ratio = np.sqrt(252) * excess_returns.mean() / excess_returns.std()

        return pd.Series({
            'Annual_Volatility': annual_volatility,
            'Downside_Volatility': downside_volatility,
            'VaR_95': var_95,
            'VaR_99': var_99,
            'Max_Drawdown': max_drawdown,
            'Sharpe_Ratio': sharpe_ratio
        })

    def plot_risk_analysis(self, symbol):
        """Create risk analysis visualizations (2x2 grid) for one stock.

        Adds a ``Rolling_Vol`` column to the stored frame as a side effect.
        """
        df = self.data[symbol]

        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

        # Rolling volatility
        df['Rolling_Vol'] = df['Daily_Return'].rolling(window=30).std() * np.sqrt(252)
        ax1.plot(df.index, df['Rolling_Vol'])
        ax1.set_title('30-Day Rolling Volatility')
        ax1.grid(True)

        # Return distribution
        df['Daily_Return'].hist(bins=50, ax=ax2)
        ax2.set_title('Return Distribution')

        # Drawdown analysis
        rolling_max = df['Close'].expanding().max()
        drawdowns = df['Close']/rolling_max - 1
        ax3.fill_between(df.index, drawdowns, 0, color='red', alpha=0.3)
        ax3.set_title('Drawdown Analysis')
        ax3.grid(True)

        # Q-Q plot against the normal distribution
        stats.probplot(df['Daily_Return'].dropna(), dist='norm', plot=ax4)
        ax4.set_title('Q-Q Plot')

        plt.tight_layout()
        plt.show()

    def analyze_all_stocks(self):
        """Perform risk analysis for all stocks; one row of metrics per symbol."""
        risk_metrics = {}
        for symbol in self.data.keys():
            print(f"\nAnalyzing risk metrics for {symbol}...")
            risk_metrics[symbol] = self.calculate_risk_metrics(self.data[symbol])
            self.plot_risk_analysis(symbol)

        return pd.DataFrame(risk_metrics).T

# Example usage
def main():
    """Run indicators, charts, and volume, pattern and risk analysis in order.

    Reads the module-level ``processed_data`` mapping
    (``{'SYMBOL': pd.DataFrame(...), ...}``).
    """
    # Technical indicators are added to the frames in place
    indicator_step = TechnicalAnalysis(processed_data)
    analysis_data = indicator_step.add_technical_indicators()

    # Charts: one detailed chart for the first symbol, then market-wide views
    charts = MarketVisualizer(analysis_data)
    first_symbol = list(analysis_data.keys())[0]
    charts.plot_price_with_indicators(first_symbol)
    charts.plot_market_comparison()
    charts.plot_correlation_heatmap()

    # Volume analysis
    volume_stats = VolumeAnalysis(analysis_data).analyze_all_stocks()
    print("\nVolume Analysis Statistics:")
    print(volume_stats.round(2))

    # Pattern analysis
    pattern_stats = PatternAnalysis(analysis_data).analyze_all_stocks()
    print("\nPattern Analysis Statistics:")
    print(pattern_stats)

    # Risk analysis
    risk_metrics = RiskAnalysis(analysis_data).analyze_all_stocks()
    print("\nRisk Analysis Metrics:")
    print(risk_metrics.round(4))


if __name__ == "__main__":
    main()

## Exchange and Sector Analysis Implementation

In [None]:
class SectorAnalysis:
    """Compare the SSE and SZSE stock groups.

    Args:
        data: Mapping of symbol to a frame with ``Daily_Return`` and
            ``Volume`` columns.
        stock_info: Frame with at least ``symbol`` and ``exchange`` columns.
            A ``market_cap`` column is optional; without it the market-cap
            statistics are reported as NaN.
    """

    # Exchanges compared by every method of this class.
    EXCHANGES = ('SSE', 'SZSE')

    def __init__(self, data, stock_info):
        self.data = data
        self.stock_info = stock_info

    def _stocks_on(self, exchange):
        """Return the ``stock_info`` rows listed on ``exchange``."""
        return self.stock_info[self.stock_info['exchange'] == exchange]

    @staticmethod
    def _market_caps(exchange_stocks):
        """Return the market-cap column, or an empty float Series if absent."""
        if 'market_cap' in exchange_stocks.columns:
            return exchange_stocks['market_cap']
        return pd.Series(dtype=float)

    def calculate_sector_performance(self, risk_free_rate=None):
        """Calculate performance metrics by exchange.

        The daily return of an exchange is the equal-weighted mean of its
        stocks' daily returns.

        Args:
            risk_free_rate: Annual rate for the Sharpe ratio.  Defaults to
                ``Config.RISK_FREE_RATE``.

        Returns:
            ``(metrics, sector_returns)``: a frame with one row per exchange
            and a dict of the daily return Series per exchange.
        """
        if risk_free_rate is None:
            risk_free_rate = Config.RISK_FREE_RATE

        sector_returns = {}
        sector_metrics = {}

        for exchange in self.EXCHANGES:
            exchange_stocks = self._stocks_on(exchange)

            # Calculate average returns for exchange
            exchange_returns = pd.DataFrame({
                symbol: self.data[symbol]['Daily_Return']
                for symbol in exchange_stocks['symbol']
                if symbol in self.data
            })
            returns = exchange_returns.mean(axis=1)
            sector_returns[exchange] = returns

            # Rows where no stock has a return (e.g. the first day) are not
            # return periods and must not count toward the annualisation.
            valid = returns.dropna()
            if valid.empty:
                total_return = annualized_return = volatility = sharpe = np.nan
            else:
                # .iloc[-1]: last value by position, not by label
                total_return = (1 + valid).cumprod().iloc[-1] - 1
                annualized_return = (1 + total_return) ** (252 / len(valid)) - 1
                volatility = valid.std() * np.sqrt(252)
                # Undefined when volatility is zero or NaN
                sharpe = (
                    (annualized_return - risk_free_rate) / volatility
                    if volatility > 0 else np.nan
                )

            sector_metrics[exchange] = {
                'Total_Return': total_return * 100,
                'Annualized_Return': annualized_return * 100,
                'Volatility': volatility * 100,
                'Sharpe_Ratio': sharpe,
                'Number_of_Stocks': len(exchange_stocks)
            }

        return pd.DataFrame(sector_metrics).T, sector_returns

    def plot_sector_analysis(self, sector_returns):
        """Create sector analysis visualizations.

        Top panel: cumulative return per exchange.  Bottom panel: 30-day
        rolling correlation between the SSE and SZSE return series.
        """
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

        # Cumulative returns comparison
        for exchange, returns in sector_returns.items():
            cumulative = (1 + returns).cumprod()
            ax1.plot(cumulative.index, cumulative, label=exchange)

        ax1.set_title('Cumulative Returns by Exchange')
        ax1.legend()
        ax1.grid(True)

        # Rolling correlation of one series against the other; this yields a
        # single Series indexed by date.
        correlation = sector_returns['SSE'].rolling(window=30).corr(sector_returns['SZSE'])

        ax2.plot(correlation.index, correlation.values)
        ax2.set_title('30-Day Rolling Correlation between Exchanges')
        ax2.grid(True)

        plt.tight_layout()
        plt.show()

    def analyze_exchange_characteristics(self):
        """Analyze characteristics of each exchange.

        Returns:
            Frame with one row per exchange: market-cap statistics (NaN when
            no market-cap data is available) and the mean of the per-stock
            average daily volumes.
        """
        exchange_stats = {}

        for exchange in self.EXCHANGES:
            exchange_stocks = self._stocks_on(exchange)
            caps = self._market_caps(exchange_stocks)

            # Calculate average daily volume
            volume_data = [
                self.data[symbol]['Volume'].mean()
                for symbol in exchange_stocks['symbol']
                if symbol in self.data
            ]
            avg_volume = np.mean(volume_data) if volume_data else 0

            exchange_stats[exchange] = {
                # min_count=1 keeps the total NaN (not 0) when nothing is known
                'Total_Market_Cap': caps.sum(min_count=1),
                'Average_Market_Cap': caps.mean(),
                'Median_Market_Cap': caps.median(),
                'Largest_Stock': caps.max(),
                'Smallest_Stock': caps.min(),
                'Average_Daily_Volume': avg_volume
            }

        return pd.DataFrame(exchange_stats).T

    def plot_exchange_composition(self):
        """Plot the market-cap distribution of each exchange side by side."""
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        for ax, exchange in zip(axes, self.EXCHANGES):
            caps = self._market_caps(self._stocks_on(exchange)).dropna()

            # Market cap distribution
            if caps.empty:
                ax.text(0.5, 0.5, 'No market cap data',
                        ha='center', va='center', transform=ax.transAxes)
            else:
                caps.hist(bins=50, ax=ax)
            ax.set_title(f'{exchange} Market Cap Distribution')

        plt.tight_layout()
        plt.show()

    def analyze_sectors(self):
        """Perform comprehensive sector analysis.

        Returns:
            ``(metrics, characteristics)`` as produced by
            ``calculate_sector_performance`` and
            ``analyze_exchange_characteristics``.
        """
        print("Calculating sector performance metrics...")
        metrics, returns = self.calculate_sector_performance()

        print("\nSector Performance Metrics:")
        display(metrics.round(2))

        print("\nPlotting sector analysis...")
        self.plot_sector_analysis(returns)

        print("\nAnalyzing exchange characteristics...")
        characteristics = self.analyze_exchange_characteristics()

        print("\nExchange Characteristics:")
        display(characteristics.round(2))

        print("\nPlotting exchange composition...")
        self.plot_exchange_composition()

        return metrics, characteristics

# Add this to the main function from the previous code
def main():
    """Entry-point fragment that runs the sector analysis step."""
    # Previous code remains the same...

    # Sector analysis: analysis_data and stock_info must already be defined
    analyzer = SectorAnalysis(analysis_data, stock_info)
    metrics, characteristics = analyzer.analyze_sectors()

### Factor Analysis Implementation

In [None]:
class FactorAnalysis:
    """Compute simple style factors, factor-timing returns and plots per stock.

    ``data`` is a mapping of symbol -> DataFrame. The factor methods read the
    ``Close``, ``Volume``, ``Daily_Return`` and ``MA_200`` columns of each
    frame.
    """

    # Number of buckets used when ranking a factor (5 = quintiles)
    _N_BUCKETS = 5

    def __init__(self, data):
        self.data = data

    def calculate_size_factor(self, df):
        """Return the log of daily traded value (``Close * Volume``).

        Note that this is traded value, not market capitalisation. Days with
        a non-positive traded value (for example zero volume) yield NaN
        instead of ``-inf`` so that means and quantiles computed from the
        series stay finite.
        """
        traded_value = df['Close'] * df['Volume']
        return np.log(traded_value.where(traded_value > 0))

    def calculate_momentum_factor(self, df, lookback=252):
        """Return the percentage change of ``Close`` over ``lookback`` rows."""
        return df['Close'].pct_change(lookback)

    def calculate_volatility_factor(self, df, window=30):
        """Return the rolling std of ``Daily_Return`` scaled by sqrt(252)."""
        return df['Daily_Return'].rolling(window=window).std() * np.sqrt(252)

    def calculate_value_factor(self, df):
        """Return the ratio of ``Close`` to the ``MA_200`` column."""
        return df['Close'] / df['MA_200']

    def _quintile_position(self, values):
        """Map a factor series to a long/short position per row.

        Rows in the top bucket get +1, rows in the bottom bucket get -1, the
        buckets in between get 0 and rows where the factor is NaN stay NaN.
        Values are ranked before bucketing so that ties cannot create
        duplicate bin edges (which makes ``pd.qcut`` raise). If fewer than
        ``_N_BUCKETS`` non-NaN values exist, every position is NaN.
        """
        ranks = values.rank(method='first')
        if ranks.count() < self._N_BUCKETS:
            return pd.Series(np.nan, index=values.index)

        # labels=False gives bucket numbers 0 (lowest) .. _N_BUCKETS - 1
        buckets = pd.qcut(ranks, self._N_BUCKETS, labels=False)
        top = self._N_BUCKETS - 1
        position = np.where(buckets == top, 1.0,
                            np.where(buckets == 0, -1.0, 0.0))
        position = np.where(buckets.isna(), np.nan, position)
        return pd.Series(position, index=values.index)

    def compute_factor_returns(self):
        """Compute a factor-timing return series per stock and factor.

        For every symbol and factor the stock's own history is split into
        quintiles of the factor value. The resulting series is the stock's
        ``Daily_Return`` multiplied by the previous row's position (+1 in the
        top quintile, -1 in the bottom quintile, 0 otherwise). The one-row
        lag keeps the position from using the same day's information as the
        return it is applied to. Quintile breakpoints are taken over the
        symbol's full history, so this is an in-sample diagnostic rather than
        a tradable backtest.

        Returns:
            pd.DataFrame: one column per ``{symbol}_{factor}``; rows where
            the lagged factor is undefined are NaN.
        """
        factor_returns = {}

        for symbol, df in self.data.items():
            factors = pd.DataFrame({
                'Size': self.calculate_size_factor(df),
                'Momentum': self.calculate_momentum_factor(df),
                'Volatility': self.calculate_volatility_factor(df),
                'Value': self.calculate_value_factor(df)
            })
            daily_returns = df['Daily_Return'].to_numpy()

            for factor in factors.columns:
                lagged_position = self._quintile_position(factors[factor]).shift(1)
                # Multiply positionally; both arrays follow df's row order
                factor_returns[f'{symbol}_{factor}'] = pd.Series(
                    lagged_position.to_numpy() * daily_returns,
                    index=df.index,
                    name=factor
                )

        return pd.DataFrame(factor_returns)

    def analyze_factor_exposure(self):
        """Return the mean of each factor per stock.

        Returns:
            pd.DataFrame: one row per symbol with the columns
            ``Size_Exposure``, ``Momentum_Exposure``, ``Volatility_Exposure``
            and ``Value_Exposure``.
        """
        factor_exposures = {
            symbol: {
                'Size_Exposure': self.calculate_size_factor(df).mean(),
                'Momentum_Exposure': self.calculate_momentum_factor(df).mean(),
                'Volatility_Exposure': self.calculate_volatility_factor(df).mean(),
                'Value_Exposure': self.calculate_value_factor(df).mean()
            }
            for symbol, df in self.data.items()
        }

        return pd.DataFrame(factor_exposures).T

    def plot_factor_analysis(self):
        """Plot factor returns, exposures, correlations and volatility.

        Draws a 2x2 figure: the cross-column mean of cumulative factor
        returns, a heatmap of exposures per stock, a heatmap of factor-return
        correlations and a bar chart of annualized factor-return volatility.

        Returns:
            tuple: ``(factor_exposures, factor_returns)`` DataFrames.
        """
        factor_returns = self.compute_factor_returns()
        factor_exposures = self.analyze_factor_exposure()

        _, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

        # Factor returns over time
        cumulative_returns = (1 + factor_returns).cumprod()
        ax1.plot(cumulative_returns.index, cumulative_returns.mean(axis=1))
        ax1.set_title('Cumulative Factor Returns')
        ax1.grid(True)

        # Factor exposures heatmap
        sns.heatmap(factor_exposures, annot=True, cmap='RdYlBu', center=0, ax=ax2)
        ax2.set_title('Factor Exposures by Stock')

        # Factor correlations
        factor_corr = factor_returns.corr()
        sns.heatmap(factor_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax3)
        ax3.set_title('Factor Correlations')

        # Factor volatility
        factor_vol = factor_returns.std() * np.sqrt(252)
        factor_vol.plot(kind='bar', ax=ax4)
        ax4.set_title('Factor Volatility (Annualized)')
        ax4.grid(True)

        plt.tight_layout()
        plt.show()

        return factor_exposures, factor_returns

# Run the factor analysis, then show the per-stock exposure table
factor_analyzer = FactorAnalysis(analysis_data)
factor_exposures, factor_returns = factor_analyzer.plot_factor_analysis()

# Both headings are emitted by one call; sep keeps them on separate lines
print("\nFactor Analysis Results:", "\nFactor Exposures:", sep="\n")
display(factor_exposures.round(4))

## Machine Learning Extensions for Stock Market Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

class MachineLearningAnalysis:
    """Machine-learning experiments over processed per-stock DataFrames.

    Provides windowed price regression (Random Forest and LSTM), K-Means
    clustering of stocks and next-day trend classification.
    """

    def __init__(self, processed_data, stock_info):
        """
        Initialize ML analysis with processed stock data

        Args:
            processed_data (dict): Dictionary of processed stock DataFrames
            stock_info (pd.DataFrame): Information about stocks
        """
        self.data = processed_data
        self.stock_info = stock_info
        # Columns read from every stock frame and used as model inputs
        self.features = ['Close', 'Open', 'High', 'Low', 'Volume',
                         'Daily_Return', 'Volatility',
                         'MA_20', 'MA_50', 'MA_200']

    def prepare_ml_dataset(self, symbol, look_back=30, future_target='Close'):
        """
        Prepare a sliding-window dataset for machine learning models

        Rows with any missing feature are dropped first. Each sample is a
        window of ``look_back`` consecutive remaining rows and its target is
        the ``future_target`` value of the row right after the window.

        Args:
            symbol (str): Stock symbol
            look_back (int): Number of previous rows to use as features
            future_target (str): Feature column to predict

        Returns:
            X (np.array): Windows, shape (n_samples, look_back, n_features)
            y (np.array): Targets, shape (n_samples,)

            When there are not more than ``look_back`` usable rows, both
            arrays are empty but keep those ranks.

        Raises:
            KeyError: If ``symbol`` is unknown or ``future_target`` is not
                one of the feature columns.
        """
        feature_data = self.data[symbol][self.features].dropna()
        values = feature_data.to_numpy()
        target_col = feature_data.columns.get_loc(future_target)

        n_windows = len(values) - look_back
        if n_windows <= 0:
            return np.empty((0, look_back, values.shape[1])), np.empty(0)

        # Slice the NumPy matrix directly; per-row DataFrame.iloc is far slower
        X = np.stack([values[start:start + look_back] for start in range(n_windows)])
        y = values[look_back:, target_col]
        return X, y

    @staticmethod
    def _split_and_scale(X, y, scaler):
        """Split windows 80/20 in order and scale the features with ``scaler``.

        The split is not shuffled, so the test set is the most recent 20% of
        windows. The scaler is fitted on the training windows only and then
        applied to the test windows. Windows are flattened to 2-D for the
        scaler and reshaped back afterwards.

        Returns:
            tuple: ``(X_train_scaled, X_test_scaled, y_train, y_test)``
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=False)

        n_features = X_train.shape[-1]
        X_train_scaled = scaler.fit_transform(
            X_train.reshape(-1, n_features)).reshape(X_train.shape)
        X_test_scaled = scaler.transform(
            X_test.reshape(-1, n_features)).reshape(X_test.shape)
        return X_train_scaled, X_test_scaled, y_train, y_test

    def random_forest_prediction(self, symbol):
        """
        Use Random Forest for stock price prediction

        Args:
            symbol (str): Stock symbol

        Returns:
            dict: Model name, symbol, test RMSE, predictions and actuals
        """
        X, y = self.prepare_ml_dataset(symbol)
        X_train, X_test, y_train, y_test = self._split_and_scale(
            X, y, StandardScaler())

        # The forest needs 2-D input, so every window becomes one flat row
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train.reshape(len(X_train), -1), y_train)
        y_pred = rf_model.predict(X_test.reshape(len(X_test), -1))

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        return {
            'model': 'Random Forest',
            'symbol': symbol,
            'RMSE': rmse,
            'Predictions': y_pred,
            'Actual': y_test
        }

    def lstm_price_prediction(self, symbol):
        """
        Use LSTM for stock price prediction

        Args:
            symbol (str): Stock symbol

        Returns:
            dict: Model name, symbol, test RMSE, predictions, actuals and
            the Keras training history
        """
        X, y = self.prepare_ml_dataset(symbol)
        X_train, X_test, y_train, y_test = self._split_and_scale(
            X, y, MinMaxScaler())

        # Two stacked LSTM layers followed by a single-value regression head
        model = Sequential([
            LSTM(50, activation='relu',
                 input_shape=(X_train.shape[1], X_train.shape[2]),
                 return_sequences=True),
            Dropout(0.2),
            LSTM(50, activation='relu'),
            Dense(1)
        ])

        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        history = model.fit(X_train, y_train,
                            epochs=50,
                            batch_size=32,
                            validation_split=0.2,
                            verbose=0)

        y_pred = model.predict(X_test).flatten()

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        return {
            'model': 'LSTM',
            'symbol': symbol,
            'RMSE': rmse,
            'Predictions': y_pred,
            'Actual': y_test,
            'Training_History': history.history
        }

    def stock_clustering(self, n_clusters=3):
        """
        Cluster stocks with K-Means on their mean feature values

        Each stock is represented by the mean of its complete feature rows.
        Stocks without any complete row are skipped (their NaN means would
        make PCA / K-Means raise). The clusters are plotted on the first two
        principal components.

        Args:
            n_clusters (int): Requested number of clusters; reduced to the
                number of usable stocks when fewer are available.

        Returns:
            pd.DataFrame: One row per stock with the feature means, the
            ``Symbol`` and the assigned ``Cluster``

        Raises:
            ValueError: If fewer than two stocks have complete feature data.
        """
        rows = []
        for symbol, df in self.data.items():
            feature_means = df[self.features].dropna().mean()
            if feature_means.isna().any():
                print(f"Skipping {symbol}: no complete feature rows for clustering")
                continue
            row = feature_means.to_dict()
            row['Symbol'] = symbol
            rows.append(row)

        # The 2-component PCA used for the plot needs at least two samples
        if len(rows) < 2:
            raise ValueError(
                "Stock clustering needs at least two stocks with complete feature data")

        cluster_df = pd.DataFrame(rows)

        # Standardize so that no single feature dominates the distances
        X_cluster = cluster_df[self.features].to_numpy(dtype=float)
        X_scaled = StandardScaler().fit_transform(X_cluster)

        # PCA is only used to get 2-D coordinates for the scatter plot
        X_pca = PCA(n_components=2).fit_transform(X_scaled)

        # K-Means cannot form more clusters than there are samples
        kmeans = KMeans(n_clusters=min(n_clusters, len(cluster_df)), random_state=42)
        cluster_labels = kmeans.fit_predict(X_scaled)
        cluster_df['Cluster'] = cluster_labels

        plt.figure(figsize=(10, 6))
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1],
                              c=cluster_labels,
                              cmap='viridis')
        plt.title('Stock Clustering using K-Means')
        plt.xlabel('First Principal Component')
        plt.ylabel('Second Principal Component')
        plt.colorbar(scatter)
        plt.tight_layout()
        plt.show()

        return cluster_df

    def _build_trend_dataset(self, symbol):
        """Return row-aligned ``(features, target)`` for trend classification.

        The target is 1 when the NEXT row's ``Daily_Return`` is positive and
        0 otherwise. Using the next row avoids label leakage, because the
        same-row ``Daily_Return`` is itself one of the features. Rows with a
        missing feature, and rows with no following return (the last row),
        are removed from both outputs with the same mask so they stay
        aligned.
        """
        df = self.data[symbol]
        next_return = df['Daily_Return'].shift(-1)

        usable = (df[self.features].notna().all(axis=1) & next_return.notna()).to_numpy()
        features = df.loc[usable, self.features]
        target = (next_return[usable] > 0).astype(int)
        return features, target

    def trend_classification(self, symbol):
        """
        Classify whether the next day's return is positive

        Args:
            symbol (str): Stock symbol

        Returns:
            dict: Model name, symbol, test accuracy and classification report
        """
        features, target = self._build_trend_dataset(symbol)

        # Ordered split: the most recent 20% of rows form the test set
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, shuffle=False)

        # Fit the scaler on training rows only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        lr_model = LogisticRegression()
        lr_model.fit(X_train_scaled, y_train)
        y_pred = lr_model.predict(X_test_scaled)

        accuracy = accuracy_score(y_test, y_pred)

        return {
            'model': 'Logistic Regression',
            'symbol': symbol,
            'Accuracy': accuracy,
            'Classification_Report': classification_report(y_test, y_pred)
        }

    def run_comprehensive_ml_analysis(self):
        """
        Run comprehensive machine learning analysis

        Returns:
            dict: Per-symbol results under the keys ``Random_Forest``,
            ``LSTM`` and ``Classification``, plus the clustering table under
            the top-level key ``Clustering``
        """
        results = {}

        for symbol in self.data:
            print(f"\nAnalyzing {symbol}...")

            results[symbol] = {
                'Random_Forest': self.random_forest_prediction(symbol),
                'LSTM': self.lstm_price_prediction(symbol),
                'Classification': self.trend_classification(symbol)
            }

        # Clustering works across all stocks, so it runs once after the loop
        results['Clustering'] = self.stock_clustering()

        return results

# Extend main function to include ML analysis
def main():
    """Entry-point fragment: run the ML analysis and print a summary."""
    # Previous code remains the same...

    # Run every ML analysis on the prepared data
    analyzer = MachineLearningAnalysis(analysis_data, stock_info)
    ml_results = analyzer.run_comprehensive_ml_analysis()

    # Per-symbol summary
    print("\nMachine Learning Analysis Summary:")
    for symbol, results in ml_results.items():
        # The clustering table shares this dict and is printed further down
        if symbol == 'Clustering':
            continue
        print(f"\n{symbol}:")
        print(f"Random Forest RMSE: {results['Random_Forest']['RMSE']:.4f}")
        print(f"LSTM RMSE: {results['LSTM']['RMSE']:.4f}")
        print(f"Classification Accuracy: {results['Classification']['Accuracy']:.2%}")

    # Clustering table
    print("\nStock Clustering:")
    print(ml_results['Clustering'])