In [1]:

import logging
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
import os
from typing import List, Dict, Any
import ast


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:

class DataCollector:
    
    def load_existing_data(self, filepath: str) -> List[Dict[str, Any]]:
        """
        Load existing review data from a JSON file.
        
        Args:
            filepath: Path to the JSON file
            
        Returns:
            List of review dictionaries
        """
        try:
            with open(filepath, 'r') as f:
                data = json.load(f)
            
            # Flatten the nested structure
            reviews = []
            for course_reviews in data:
                reviews.extend(course_reviews)
            
            #logger.info(f"Successfully loaded {len(reviews)} reviews from {filepath}")
            return reviews
        except Exception as e:
            #logger.error(f"Error loading data: {e}")
            raise
    
    def parse_json_string(self, json_str):
        """
        Parse a JSON string into a dictionary.
        
        Args:
            json_str: JSON string to parse
            
        Returns:
            Dictionary containing the parsed data
        """
        try:
            if pd.isna(json_str):
                return {}
            # Use ast.literal_eval to safely evaluate the string representation of a dictionary
            return ast.literal_eval(json_str)
        except:
            return {}
    
    def save_data(self, data: List[Dict[str, Any]], filepath: str):
        """
        Save data to a CSV file.
        
        Args:
            data: List of review dictionaries
            filepath: Output file path
        """
        try:
            # Convert list of dictionaries to DataFrame
            df = pd.DataFrame(data)
            
            # Clean and standardize columns
            df = df.rename(columns={
                'Quality': 'rating',
                'Difficulty': 'difficulty',
                'Grade': 'grade',
                'Comment': 'text',
                'professor': 'professor_name'
            })
            
            # Convert numeric columns
            df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
            df['difficulty'] = pd.to_numeric(df['difficulty'], errors='coerce')
            
            # Save to CSV
            df.to_csv(filepath, index=False)
            #logger.info(f"Saved {len(df)} reviews to {filepath}")
        except Exception as e:
            #logger.error(f"Error saving data: {e}")
            raise
    
    def load_and_split_data(self):
        """
        Load existing data and split into training and test sets.
        """
        try:
            # Load data
            data = self.load_existing_data("datasets/all_reviews.json")
            
            # Split into training and test sets (80-20 split)
            train_size = int(len(data) * 0.8)
            training_data = data[:train_size]
            test_data = data[train_size:]
            
            # Save the splits
            self.save_data(training_data, "data/raw/training_reviews.csv")
            self.save_data(test_data, "data/raw/test_reviews.csv")
            
        except Exception as e:
            #logger.error(f"Error in load_and_split_data: {e}")
            raise

def main():
    # Example usage
    collector = DataCollector()
    collector.load_and_split_data()

if __name__ == "__main__":
    main() 

In [5]:
import pandas as pd
import numpy as np
from typing import Tuple
import logging
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('wordnet')
nltk.download('stopwords')

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DataPreprocessor:
    def __init__(self):
        # Initialize NLTK components
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        
    def load_data(self, filepath: str) -> pd.DataFrame:
        """
        Load the raw data from CSV file.
        
        Args:
            filepath: Path to the CSV file
            
        Returns:
            DataFrame containing the raw data
        """
        try:
            df = pd.read_csv(filepath)
            logger.info(f"Successfully loaded data from {filepath}")
            return df
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise
    
    def clean_text(self, text: str) -> str:
        """
        Clean and preprocess text data.
        
        Args:
            text: Raw text string
            
        Returns:
            Cleaned text string
        """
        if not isinstance(text, str):
            return ""
            
        # Convert to lowercase
        text = text.lower()
        
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        # Split into words
        words = text.split()
        
        # Remove stopwords and lemmatize
        cleaned_words = [
            self.lemmatizer.lemmatize(word)
            for word in words
            if word not in self.stop_words
        ]
        
        return ' '.join(cleaned_words)
    
    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Preprocess the entire dataset.
        
        Args:
            df: Raw DataFrame
            
        Returns:
            Preprocessed DataFrame
        """
        # Create a copy to avoid modifying the original
        df_processed = df.copy()
        
        # Clean text data
        if 'text' in df_processed.columns:
            df_processed['cleaned_text'] = df_processed['text'].apply(self.clean_text)
        
        # Handle missing values
        df_processed = df_processed.dropna(subset=['rating', 'difficulty'])
        
        # Convert grade to numerical values if needed
        if 'grade' in df_processed.columns:
            grade_mapping = {
                'A+': 4.0, 'A': 4.0, 'A-': 3.7,
                'B+': 3.3, 'B': 3.0, 'B-': 2.7,
                'C+': 2.3, 'C': 2.0, 'C-': 1.7,
                'D+': 1.3, 'D': 1.0, 'D-': 0.7,
                'F': 0.0
            }
            df_processed['grade_numerical'] = df_processed['grade'].map(grade_mapping)
        
        return df_processed
    
    def save_processed_data(self, df: pd.DataFrame, filepath: str):
        """
        Save processed data to CSV file.
        
        Args:
            df: Processed DataFrame
            filepath: Output file path
        """
        try:
            df.to_csv(filepath, index=False)
            logger.info(f"Successfully saved processed data to {filepath}")
        except Exception as e:
            logger.error(f"Error saving processed data: {e}")
            raise

def main():
    # # Example usage
    preprocessor = DataPreprocessor()
    
    # # Load raw data
    # raw_data = preprocessor.load_data("data/raw/professor_reviews.csv")
    
    # # Preprocess data
    # processed_data = preprocessor.preprocess_data(raw_data)
    
    # # Save processed data
    # preprocessor.save_processed_data(processed_data, "data/processed/processed_reviews.csv")

if __name__ == "__main__":
    main() 

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/markmalysa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/markmalysa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import logging
from typing import Tuple, Dict, Any
import pickle

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SentimentAnalyzer:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.model = None
        self.model_type = None
    
    def prepare_data(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """
        Prepare data for sentiment analysis.
        
        Args:
            df: DataFrame containing cleaned text and ratings
            
        Returns:
            Tuple of (X, y) for model training
        """
        # Convert ratings to binary sentiment (positive/negative)
        df['sentiment'] = df['rating'].apply(lambda x: 1 if x >= 3.5 else 0)
        
        # Vectorize text
        if not hasattr(self.vectorizer, 'vocabulary_'):
            # Only fit the vectorizer if it hasn't been fit before
            X = self.vectorizer.fit_transform(df['cleaned_text'])
        else:
            # Use transform only if the vectorizer was already fit
            X = self.vectorizer.transform(df['cleaned_text'])
            
        y = df['sentiment'].values
        
        return X, y
    
    def train_model(self, X: np.ndarray, y: np.ndarray, model_type: str = 'nb') -> Dict[str, Any]:
        """
        Train a sentiment analysis model.
        
        Args:
            X: Feature matrix
            y: Target vector
            model_type: Type of model to train ('nb' for Naive Bayes, 'lr' for Logistic Regression)
            
        Returns:
            Dictionary containing model performance metrics
        """
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        # Train model
        if model_type == 'nb':
            self.model = MultinomialNB()
            self.model_type = 'Naive Bayes'
        elif model_type == 'lr':
            self.model = LogisticRegression(max_iter=1000)
            self.model_type = 'Logistic Regression'
        else:
            raise ValueError("Invalid model type. Use 'nb' or 'lr'")
        
        self.model.fit(X_train, y_train)
        
        # Evaluate model
        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        logger.info(f"Model: {self.model_type}")
        logger.info(f"Accuracy: {accuracy:.4f}")
        logger.info(f"F1 Score: {f1:.4f}")
        logger.info("\nClassification Report:")
        logger.info(classification_report(y_test, y_pred))
        
        return {
            'accuracy': accuracy,
            'f1_score': f1,
            'classification_report': classification_report(y_test, y_pred)
        }
    
    def evaluate_model(self, X: np.ndarray, y: np.ndarray) -> Dict[str, Any]:
        """
        Evaluate the trained model on new data.
        
        Args:
            X: Feature matrix
            y: Target vector
            
        Returns:
            Dictionary containing model performance metrics
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train_model first.")
        
        # Make predictions
        y_pred = self.model.predict(X)
        accuracy = accuracy_score(y, y_pred)
        f1 = f1_score(y, y_pred)
        
        logger.info(f"Model Evaluation ({self.model_type}):")
        logger.info(f"Accuracy: {accuracy:.4f}")
        logger.info(f"F1 Score: {f1:.4f}")
        logger.info("\nClassification Report:")
        logger.info(classification_report(y, y_pred))
        
        return {
            'accuracy': accuracy,
            'f1_score': f1,
            'classification_report': classification_report(y, y_pred)
        }
    
    def predict_sentiment(self, text: str) -> Tuple[int, float]:
        """
        Predict sentiment for a new text.
        
        Args:
            text: Text to analyze
            
        Returns:
            Tuple of (predicted class, probability)
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train_model first.")
        
        # Vectorize text
        X = self.vectorizer.transform([text])
        
        # Predict
        pred_class = self.model.predict(X)[0]
        pred_prob = self.model.predict_proba(X)[0][1]
        
        return pred_class, pred_prob
    
    def save_model(self, model_path: str, vectorizer_path: str):
        """
        Save trained model and vectorizer.
        
        Args:
            model_path: Path to save model
            vectorizer_path: Path to save vectorizer
        """
        try:
            with open(model_path, 'wb') as f:
                pickle.dump(self.model, f)
            with open(vectorizer_path, 'wb') as f:
                pickle.dump(self.vectorizer, f)
            logger.info(f"Model and vectorizer saved to {model_path} and {vectorizer_path}")
        except Exception as e:
            logger.error(f"Error saving model: {e}")
            raise
    
    def load_model(self, model_path: str, vectorizer_path: str):
        """
        Load trained model and vectorizer.
        
        Args:
            model_path: Path to load model from
            vectorizer_path: Path to load vectorizer from
        """
        try:
            with open(model_path, 'rb') as f:
                self.model = pickle.load(f)
            with open(vectorizer_path, 'rb') as f:
                self.vectorizer = pickle.load(f)
            logger.info("Model and vectorizer loaded successfully")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

def main():
    # Example usage
    analyzer = SentimentAnalyzer()
    
    # # Load processed data
    # df = pd.read_csv("data/processed/processed_reviews.csv")
    
    # # Prepare data
    # X, y = analyzer.prepare_data(df)
    
    # # Train model
    # metrics = analyzer.train_model(X, y, model_type='nb')
    
    # # Save model
    # analyzer.save_model(
    #     "models/sentiment_model.pkl",
    #     "models/vectorizer.pkl"
    # )
    
    # # Example prediction
    # sample_text = "This professor is amazing and very helpful!"
    # pred_class, pred_prob = analyzer.predict_sentiment(sample_text)
    # logger.info(f"Sample text: {sample_text}")
    # logger.info(f"Predicted sentiment: {'Positive' if pred_class == 1 else 'Negative'}")
    # logger.info(f"Confidence: {pred_prob:.4f}")

if __name__ == "__main__":
    main() 

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
import logging
from typing import Tuple, Dict, Any
import pickle
import joblib

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class GradePredictor:
    def __init__(self):
        self.model = RandomForestRegressor(n_estimators=100, random_state=42)
        self.scaler = StandardScaler()
        self.base_features = ['rating', 'difficulty', 'sentiment_score']
        self.feature_columns = self.base_features.copy()
        self.professor_features = ['review_count', 'avg_rating', 'avg_difficulty']
        
    def calculate_sentiment_score(self, text):
        """Calculate sentiment score for a given text."""
        try:
            return TextBlob(str(text)).sentiment.polarity
        except:
            return 0.0
        
    def prepare_features(self, df: pd.DataFrame, is_training: bool = True):
        """
        Prepare features for model training or prediction.
        
        Args:
            df: Input DataFrame
            is_training: Whether this is for training (True) or prediction (False)
            
        Returns:
            Tuple of (X, y) where X is features DataFrame and y is target Series
        """
        # Create a copy to avoid modifying the original
        df_features = df.copy()
        
        # Calculate sentiment scores
        logger.info("Calculating sentiment scores...")
        df_features['sentiment_score'] = df_features['cleaned_text'].apply(self.calculate_sentiment_score)
        
        # Create professor-level features if professor_name is available
        if 'professor_name' in df_features.columns:
            prof_stats = df_features.groupby('professor_name').agg({
                'rating': ['count', 'mean'],
                'difficulty': 'mean'
            }).reset_index()
            prof_stats.columns = ['professor_name', 'review_count', 'avg_rating', 'avg_difficulty']
            
            # Merge back to main DataFrame
            df_features = df_features.merge(prof_stats, on='professor_name', how='left')
            
            # Add these to feature columns if in training mode
            if is_training:
                self.feature_columns = self.base_features + self.professor_features
        
        # Convert grade to numerical values if needed
        if 'grade' in df_features.columns:
            grade_mapping = {
                'A+': 4.0, 'A': 4.0, 'A-': 3.7,
                'B+': 3.3, 'B': 3.0, 'B-': 2.7,
                'C+': 2.3, 'C': 2.0, 'C-': 1.7,
                'D+': 1.3, 'D': 1.0, 'D-': 0.7,
                'F': 0.0
            }
            df_features['grade_numerical'] = df_features['grade'].map(grade_mapping)
        
        # Add missing professor-level features if needed
        for feature in self.feature_columns:
            if feature not in df_features.columns:
                df_features[feature] = 0.0  # Use default value for missing features
        
        # Prepare feature matrix
        X = df_features[self.feature_columns].copy()
        
        # Handle missing values
        X = X.fillna(X.mean())
        
        # Scale features
        if is_training:
            X_scaled = self.scaler.fit_transform(X)
        else:
            X_scaled = self.scaler.transform(X)
        
        X_scaled = pd.DataFrame(X_scaled, columns=self.feature_columns)
        
        if 'grade_numerical' in df_features.columns:
            y = df_features['grade_numerical'].fillna(df_features['grade_numerical'].mean())
            return X_scaled, y
        else:
            return X_scaled, None
    
    def train(self, train_df: pd.DataFrame):
        """
        Train the grade prediction model.
        
        Args:
            train_df: Training DataFrame
        """
        logger.info("Preparing features for training...")
        X, y = self.prepare_features(train_df, is_training=True)
        
        # Drop rows with missing target values
        mask = ~y.isna()
        X = X[mask]
        y = y[mask]
        
        logger.info("Training model...")
        self.model.fit(X, y)
        
        # Calculate training metrics
        train_pred = self.model.predict(X)
        train_rmse = np.sqrt(np.mean((y - train_pred) ** 2))
        logger.info(f"Training RMSE: {train_rmse:.3f}")
    
    def evaluate_model(self, test_df: pd.DataFrame) -> dict:
        """
        Evaluate model performance on test data.
        
        Args:
            test_df: Test DataFrame
            
        Returns:
            Dictionary containing evaluation metrics
        """
        X_test, y_test = self.prepare_features(test_df, is_training=False)
        
        # Drop rows with missing target values
        mask = ~y_test.isna()
        X_test = X_test[mask]
        y_test = y_test[mask]
        
        # Make predictions
        y_pred = self.model.predict(X_test)
        
        # Calculate metrics
        rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
        mae = np.mean(np.abs(y_test - y_pred))
        r2 = self.model.score(X_test, y_test)
        
        metrics = {
            'rmse': rmse,
            'mae': mae,
            'r2': r2
        }
        
        logger.info(f"Test RMSE: {rmse:.3f}")
        logger.info(f"Test MAE: {mae:.3f}")
        logger.info(f"Test R2: {r2:.3f}")
        
        return metrics
    
    def predict_grade(self, features_dict: dict) -> tuple:
        """
        Predict grade for new data.
        
        Args:
            features_dict: Dictionary containing feature values
            
        Returns:
            Tuple of (predicted_grade, confidence_interval)
        """
        # Create DataFrame from features
        df = pd.DataFrame([features_dict])
        
        # Prepare features
        X, _ = self.prepare_features(df, is_training=False)
        
        # Make prediction
        pred = self.model.predict(X)[0]
        
        # Calculate confidence interval using prediction std
        predictions = []
        for estimator in self.model.estimators_:
            predictions.append(estimator.predict(X)[0])
        
        ci_lower = np.percentile(predictions, 2.5)
        ci_upper = np.percentile(predictions, 97.5)
        
        # Convert numerical grade to letter grade
        grade_mapping = {
            4.0: 'A',
            3.7: 'A-',
            3.3: 'B+',
            3.0: 'B',
            2.7: 'B-',
            2.3: 'C+',
            2.0: 'C',
            1.7: 'C-',
            1.3: 'D+',
            1.0: 'D',
            0.7: 'D-',
            0.0: 'F'
        }
        
        # Find closest grade
        pred_letter = min(grade_mapping.items(), key=lambda x: abs(x[0] - pred))[1]
        ci_lower_letter = min(grade_mapping.items(), key=lambda x: abs(x[0] - ci_lower))[1]
        ci_upper_letter = min(grade_mapping.items(), key=lambda x: abs(x[0] - ci_upper))[1]
        
        return (pred_letter, (ci_lower_letter, ci_upper_letter)), (pred, (ci_lower, ci_upper))
    
    def save_model(self, model_path: str):
        """Save the trained model to disk."""
        joblib.dump((self.model, self.scaler, self.feature_columns), model_path)
        logger.info(f"Model saved to {model_path}")
    
    def load_model(self, model_path: str):
        """Load a trained model from disk."""
        self.model, self.scaler, self.feature_columns = joblib.load(model_path)
        logger.info(f"Model loaded from {model_path}")
            
    def get_feature_importance(self) -> pd.DataFrame:
        """
        Get feature importance scores from the model.
        
        Returns:
            DataFrame containing feature names and their importance scores
        """
        if not hasattr(self, 'model') or not hasattr(self.model, 'feature_importances_'):
            raise ValueError("Model not trained or does not support feature importance")
            
        importance_scores = self.model.feature_importances_
        feature_importance = pd.DataFrame({
            'feature': self.feature_columns,
            'importance': importance_scores
        })
        
        # Sort by importance in descending order
        feature_importance = feature_importance.sort_values('importance', ascending=False)
        
        return feature_importance

def main():
    # Example usage
    
    # Load processed data
    train_data = pd.read_csv("data/processed/processed_training_reviews.csv")
    test_data = pd.read_csv("data/processed/processed_test_reviews.csv")
    
    # # Initialize and train model
    # predictor = GradePredictor()
    # predictor.train(train_data)
    
    # # Evaluate model
    # metrics = predictor.evaluate_model(test_data)
    
    # # Save model
    # predictor.save_model("models/grade_predictor.pkl")
    
    # # Example prediction
    # sample_features = {
    #     'rating': 4.5,
    #     'difficulty': 3.0,
    #     'cleaned_text': "The professor was very helpful and explained concepts clearly."
    # }
    
    # (pred_letter, ci_letter), (pred_num, ci_num) = predictor.predict_grade(sample_features)
    # logger.info(f"Predicted grade (letter): {pred_letter}")
    # logger.info(f"95% Confidence Interval (letter): ({ci_letter[0]}, {ci_letter[1]})")
    # logger.info(f"Predicted grade (numerical): {pred_num:.2f}")
    # logger.info(f"95% Confidence Interval (numerical): ({ci_num[0]:.2f}, {ci_num[1]:.2f})")

if __name__ == "__main__":
    main() 

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Optional
import logging
import os

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DataVisualizer:
    def __init__(self, output_dir: str = "visualizations"):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        
        # Set style
        sns.set_style("whitegrid")
        sns.set_palette("husl")
    
    def plot_sentiment_distribution(self, df: pd.DataFrame, suffix: str = ""):
        """
        Plot distribution of sentiment scores.
        
        Args:
            df: DataFrame containing sentiment scores
            suffix: Suffix to add to output filename
        """
        plt.figure(figsize=(10, 6))
        sns.histplot(data=df, x='sentiment_score', bins=20)
        plt.title('Distribution of Sentiment Scores')
        plt.xlabel('Sentiment Score')
        plt.ylabel('Count')
        plt.savefig(f"{self.output_dir}/sentiment_distribution{suffix}.png")
        plt.close()
    
    def plot_grade_correlation(self, df: pd.DataFrame, suffix: str = ""):
        """
        Plot correlation between sentiment and grades.
        
        Args:
            df: DataFrame containing sentiment scores and grades
            suffix: Suffix to add to output filename
        """
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=df, x='sentiment_score', y='grade_numerical')
        plt.title('Correlation between Sentiment and Grades')
        plt.xlabel('Sentiment Score')
        plt.ylabel('Grade')
        plt.savefig(f"{self.output_dir}/grade_correlation{suffix}.png")
        plt.close()
    
    def plot_difficulty_rating(self, df: pd.DataFrame, suffix: str = ""):
        """
        Plot relationship between difficulty and rating.
        
        Args:
            df: DataFrame containing difficulty and rating scores
            suffix: Suffix to add to output filename
        """
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=df, x='difficulty', y='rating')
        plt.title('Relationship between Difficulty and Rating')
        plt.xlabel('Difficulty')
        plt.ylabel('Rating')
        plt.savefig(f"{self.output_dir}/difficulty_rating{suffix}.png")
        plt.close()
    
    def plot_department_comparison(self, df: pd.DataFrame, suffix: str = ""):
        """
        Plot comparison of ratings across departments.
        
        Args:
            df: DataFrame containing department and rating information
            suffix: Suffix to add to output filename
        """
        plt.figure(figsize=(12, 6))
        sns.boxplot(data=df, x='department', y='rating')
        plt.title('Rating Distribution by Department')
        plt.xlabel('Department')
        plt.ylabel('Rating')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f"{self.output_dir}/department_comparison{suffix}.png")
        plt.close()
    
    def plot_feature_importance(self, importance_df: pd.DataFrame, suffix: str = ""):
        """
        Plot feature importance scores.
        
        Args:
            importance_df: DataFrame containing feature importance scores
            suffix: Suffix to add to output filename
        """
        plt.figure(figsize=(10, 6))
        sns.barplot(data=importance_df, x='importance', y='feature')
        plt.title('Feature Importance for Grade Prediction')
        plt.xlabel('Importance Score')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.savefig(f"{self.output_dir}/feature_importance{suffix}.png")
        plt.close()
    
    def plot_sentiment_trends(self, df: pd.DataFrame, suffix: str = ""):
        """
        Plot sentiment trends over time.
        
        Args:
            df: DataFrame containing sentiment scores and dates
            suffix: Suffix to add to output filename
        """
        # Convert date column to datetime if needed
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'])
            df = df.sort_values('date')
            
            plt.figure(figsize=(12, 6))
            sns.lineplot(data=df, x='date', y='sentiment_score')
            plt.title('Sentiment Trends Over Time')
            plt.xlabel('Date')
            plt.ylabel('Average Sentiment Score')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(f"{self.output_dir}/sentiment_trends{suffix}.png")
            plt.close()
    
    def create_correlation_matrix(self, df: pd.DataFrame, features: List[str], suffix: str = ""):
        """
        Create correlation matrix heatmap.
        
        Args:
            df: DataFrame containing features
            features: List of features to include in correlation matrix
            suffix: Suffix to add to output filename
        """
        plt.figure(figsize=(10, 8))
        corr_matrix = df[features].corr()
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
        plt.title('Feature Correlation Matrix')
        plt.tight_layout()
        plt.savefig(f"{self.output_dir}/correlation_matrix{suffix}.png")
        plt.close()
    
    def plot_model_comparison(self, train_metrics: Dict[str, float], test_metrics: Dict[str, float], 
                            model_name: str, suffix: str = ""):
        """
        Plot comparison of model performance on training and test data.
        
        Args:
            train_metrics: Dictionary of training metrics
            test_metrics: Dictionary of test metrics
            model_name: Name of the model
            suffix: Suffix to add to output filename
        """
        metrics = ['accuracy', 'f1_score', 'rmse', 'r2_score']
        train_values = [train_metrics.get(m, 0) for m in metrics]
        test_values = [test_metrics.get(m, 0) for m in metrics]
        
        x = np.arange(len(metrics))
        width = 0.35
        
        plt.figure(figsize=(12, 6))
        plt.bar(x - width/2, train_values, width, label='Training')
        plt.bar(x + width/2, test_values, width, label='Test')
        
        plt.xlabel('Metric')
        plt.ylabel('Score')
        plt.title(f'{model_name} Performance Comparison')
        plt.xticks(x, metrics)
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"{self.output_dir}/model_comparison_{model_name}{suffix}.png")
        plt.close()
    
    def plot_actual_vs_predicted(self, y_true, y_pred, suffix=''):
        """
        Plot actual vs. predicted grades.
        
        Args:
            y_true (array-like): Actual grades.
            y_pred (array-like): Predicted grades.
            suffix (str): Suffix for the output filename.
        """
        plt.figure(figsize=(10, 6))
        plt.scatter(y_true, y_pred, alpha=0.5)
        plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)
        plt.xlabel('Actual Grades')
        plt.ylabel('Predicted Grades')
        plt.title('Actual vs. Predicted Grades')
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, f'actual_vs_predicted{suffix}.png'))
        plt.close()

def main():
    # Example usage
    visualizer = DataVisualizer()
    
    # # Load processed data
    # training_df = pd.read_csv("data/processed/processed_training_reviews.csv")
    # test_df = pd.read_csv("data/processed/processed_test_reviews.csv")
    
    # # Create visualizations for training data
    # visualizer.plot_sentiment_distribution(training_df, suffix="_train")
    # visualizer.plot_grade_correlation(training_df, suffix="_train")
    # visualizer.plot_difficulty_rating(training_df, suffix="_train")
    # visualizer.plot_department_comparison(training_df, suffix="_train")
    
    # # Create visualizations for test data
    # visualizer.plot_sentiment_distribution(test_df, suffix="_test")
    # visualizer.plot_grade_correlation(test_df, suffix="_test")
    # visualizer.plot_difficulty_rating(test_df, suffix="_test")
    
    # # Load feature importance data
    # importance_df = pd.read_csv("models/feature_importance.csv")
    # visualizer.plot_feature_importance(importance_df)
    
    # # Create correlation matrices
    # features = ['rating', 'difficulty', 'sentiment_score', 'grade_numerical']
    # visualizer.create_correlation_matrix(training_df, features, suffix="_train")
    # visualizer.create_correlation_matrix(test_df, features, suffix="_test")
    
    # # Plot sentiment trends if date data is available
    # if 'date' in training_df.columns:
    #     visualizer.plot_sentiment_trends(training_df, suffix="_train")
    # if 'date' in test_df.columns:
    #     visualizer.plot_sentiment_trends(test_df, suffix="_test")

    # # After model evaluation
    # y_pred = model.predict(X_test_scaled)
    # visualizer.plot_actual_vs_predicted(y_test, y_pred, suffix="_test")

if __name__ == "__main__":
    main() 



this is the main function for to run the Random Forrest Run

In [11]:
import logging
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# from data_collection import DataCollector
# from preprocessing import DataPreprocessor
# from src.models.sentiment_analysis import SentimentAnalyzer
# from src.models.predictive_modeling import GradePredictor
# from src.visualization.visualization import DataVisualizer

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def add_sentiment_scores(df: pd.DataFrame, sentiment_analyzer: SentimentAnalyzer) -> pd.DataFrame:
    """
    Add sentiment scores to the DataFrame.
    """
    # Get sentiment probabilities for each review
    X = sentiment_analyzer.vectorizer.transform(df['cleaned_text'])
    probabilities = sentiment_analyzer.model.predict_proba(X)
    df['sentiment_score'] = probabilities[:, 1]  # Probability of positive sentiment
    return df

def main():
    """
    Main function to run the entire analysis pipeline.
    """
    try:
        # Step 1: Data Collection
        logger.info("Starting data collection...")
        logger.info("Loading and splitting data...")
        collector = DataCollector()
        collector.load_and_split_data()
        
        # Step 2: Data Preprocessing
        logger.info("Starting data preprocessing...")
        preprocessor = DataPreprocessor()
        
        # Preprocess training data
        logger.info("Preprocessing training data...")
        train_df = preprocessor.load_data("data/raw/training_reviews.csv")
        processed_train_df = preprocessor.preprocess_data(train_df)
        preprocessor.save_processed_data(processed_train_df, "data/processed/processed_training_reviews.csv")
        
        # Preprocess test data
        logger.info("Preprocessing test data...")
        test_df = preprocessor.load_data("data/raw/test_reviews.csv")
        processed_test_df = preprocessor.preprocess_data(test_df)
        preprocessor.save_processed_data(processed_test_df, "data/processed/processed_test_reviews.csv")
        
        # Step 3: Sentiment Analysis
        logger.info("Starting sentiment analysis...")
        sentiment_analyzer = SentimentAnalyzer()
        
        # Train sentiment model
        logger.info("Training sentiment model...")
        X_train, y_train = sentiment_analyzer.prepare_data(processed_train_df)
        metrics = sentiment_analyzer.train_model(X_train, y_train, model_type='nb')
        sentiment_analyzer.save_model("models/sentiment_model.pkl", "models/vectorizer.pkl")
        
        # Evaluate sentiment model
        logger.info("Evaluating sentiment model on test data...")
        X_test, y_test = sentiment_analyzer.prepare_data(processed_test_df)
        test_metrics = sentiment_analyzer.evaluate_model(X_test, y_test)
        
        # Add sentiment scores to DataFrames
        logger.info("Adding sentiment scores to DataFrames...")
        processed_train_df = add_sentiment_scores(processed_train_df, sentiment_analyzer)
        processed_test_df = add_sentiment_scores(processed_test_df, sentiment_analyzer)
        
        # Step 4: Grade Prediction
        logger.info("Starting grade prediction modeling...")
        predictor = GradePredictor()
        
        # Train grade prediction model
        logger.info("Training grade prediction model...")
        predictor.train(processed_train_df)
        
        # Evaluate grade prediction model
        logger.info("Evaluating grade prediction model...")
        metrics = predictor.evaluate_model(processed_test_df)
        
        # Save the model
        predictor.save_model("models/grade_predictor.pkl")
        
        # Example prediction
        sample_features = {
            'rating': 4.5,
            'difficulty': 3.0,
            'cleaned_text': "The professor was very helpful and explained concepts clearly."
        }
        
        predicted_grade, confidence_interval = predictor.predict_grade(sample_features)
        logger.info(f"Example Prediction:")
        logger.info(f"Predicted grade: {predicted_grade}")
        logger.info(f"95% Confidence Interval: {confidence_interval}")
        
        # Save feature importance
        importance_df = predictor.get_feature_importance()
        importance_df.to_csv("models/feature_importance.csv", index=False)
        
        # Step 5: Visualization
        logger.info("Creating visualizations...")
        visualizer = DataVisualizer()
        
        # Visualize training data
        logger.info("Visualizing training data...")
        visualizer.plot_sentiment_distribution(processed_train_df)
        visualizer.plot_grade_correlation(processed_train_df)
        visualizer.plot_difficulty_rating(processed_train_df)
        visualizer.plot_department_comparison(processed_train_df)
        visualizer.plot_feature_importance(importance_df)
        
        # Visualize test data
        logger.info("Visualizing test data...")
        visualizer.plot_sentiment_distribution(processed_test_df, suffix="_test")
        visualizer.plot_grade_correlation(processed_test_df, suffix="_test")
        visualizer.plot_difficulty_rating(processed_test_df, suffix="_test")
        
        # Create correlation matrices
        features = ['rating', 'difficulty', 'sentiment_score', 'grade_numerical']
        visualizer.create_correlation_matrix(processed_train_df, features, suffix="_train")
        visualizer.create_correlation_matrix(processed_test_df, features, suffix="_test")
        
        if 'date' in processed_train_df.columns:
            visualizer.plot_sentiment_trends(processed_train_df, suffix="_train")
        if 'date' in processed_test_df.columns:
            visualizer.plot_sentiment_trends(processed_test_df, suffix="_test")
        
        # After model evaluation
        metrics = predictor.evaluate_model(processed_test_df)
        y_test = processed_test_df['grade_numerical']
        X_test_scaled, _ = predictor.prepare_features(processed_test_df, is_training=False)
        y_pred = predictor.model.predict(X_test_scaled)
        visualizer.plot_actual_vs_predicted(y_test, y_pred, suffix='_test')
        
        logger.info("Analysis pipeline completed successfully!")
        
    except Exception as e:
        logger.error(f"Error in analysis pipeline: {str(e)}", exc_info=True)

if __name__ == "__main__":
    main() 

INFO:__main__:Starting data collection...
INFO:__main__:Loading and splitting data...
INFO:__main__:Starting data preprocessing...
INFO:__main__:Preprocessing training data...
INFO:__main__:Successfully loaded data from data/raw/training_reviews.csv
INFO:__main__:Successfully saved processed data to data/processed/processed_training_reviews.csv
INFO:__main__:Preprocessing test data...
INFO:__main__:Successfully loaded data from data/raw/test_reviews.csv
INFO:__main__:Successfully saved processed data to data/processed/processed_test_reviews.csv
INFO:__main__:Starting sentiment analysis...
INFO:__main__:Training sentiment model...
INFO:__main__:Model: Naive Bayes
INFO:__main__:Accuracy: 0.7370
INFO:__main__:F1 Score: 0.8446
INFO:__main__:
Classification Report:
INFO:__main__:              precision    recall  f1-score   support

           0       0.92      0.08      0.14       153
           1       0.73      1.00      0.84       387

    accuracy                           0.74       5