In [None]:
import multiprocessing
import os
import random
import re
from typing import List, Dict, Union

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from termcolor import colored
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
os.environ["WANDB_DISABLED"] = "true"

In [None]:
directory_path = '/kaggle/input/iteration5-992tweets/Iterations - Iteration5.csv'

## Load the Dataset

In [None]:
# Load the dataset
df = pd.read_csv(directory_path)

In [None]:
df.shape

In [None]:
df["University"].nunique()

In [None]:
df["DEI"].value_counts()

## Check Class Distribution

In [None]:
# Count the occurrences of each label
label_counts = df['DEI'].value_counts()

# Create a bar plot
plt.figure(figsize=(8, 5))
sns.barplot(x=label_counts.index, y=label_counts.values, palette='viridis')

# Customize the plot
plt.title('Distribution of DEI Labels')
plt.xlabel('Labels')
plt.ylabel('Number of Tweets')
plt.xticks(ticks=[0, 1], labels=['Non-DEI (0)', 'DEI (1)'])
plt.ylim(0, max(label_counts.values) + 50)

# Show the plot
plt.show()

## Split the dataset into train, test and validation sets

In [None]:
# Split the dataset 80/10/10 (train/validation/test).
# Stratifying on the label keeps the DEI / non-DEI ratio the same in every
# split, so the small validation and test sets stay representative.
# (train_test_split is already imported in the first cell.)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['DEI'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['DEI'])

In [None]:
# Print the size of each dataset
print(f"Training set size: {train_df.shape[0]} samples")
print(f"Validation set size: {val_df.shape[0]} samples")
print(f"Test set size: {test_df.shape[0]} samples")

## Define path for Model and Tokenizer

In [None]:
# Load the BERTweet model and tokenizer
model_path = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)

## Tokenize the train, test and validation sets

In [None]:
# Tokenization function
def tokenize(df):
    """Encode the ``Tweet`` column of ``df`` with the notebook-level ``tokenizer``.

    Each tweet is padded or truncated to 128 tokens and the encodings are
    returned as PyTorch tensors.
    """
    tweet_texts = df['Tweet'].tolist()
    encoding_options = dict(padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    return tokenizer(tweet_texts, **encoding_options)

In [None]:
# Tokenize the train, validation, and test data
train_encoded = tokenize(train_df)
val_encoded = tokenize(val_df)
test_encoded = tokenize(test_df)

In [None]:
# Prepare the labels (ensure they are tensor-compatible with PyTorch)
train_labels = torch.tensor(train_df['DEI'].values)
val_labels = torch.tensor(val_df['DEI'].values)
test_labels = torch.tensor(test_df['DEI'].values)

## Load the Model

In [None]:
# Load the BERTweet model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)  # 2 classes (DEI vs non-DEI)

## Create a Custom Pytorch Dataset

In [None]:
# Define the Hugging Face Dataset class
class DEIDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenized encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an
            indexable collection holding one row per example.
        labels: Indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor(existing_tensor) emits a copy-construct UserWarning for
        # every field of every example; clone().detach() is the warning-free
        # equivalent. Non-tensor values still go through torch.tensor().
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, label under ``labels``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

In [None]:
# Create the train, val, and test datasets
train_dataset = DEIDataset(train_encoded, train_labels)
val_dataset = DEIDataset(val_encoded, val_labels)
test_dataset = DEIDataset(test_encoded, test_labels)

## Set up the Training Parameters and Evaluation Metrics

In [None]:
def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (either per-class scores of shape (n, n_classes) or already-decided
            class ids).

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc``.
        ``auc`` is NaN when the labels contain a single class, because ROC-AUC
        is undefined in that case.
    """
    labels = np.asarray(pred.label_ids).flatten()
    preds = np.asarray(pred.predictions)

    scores = None
    if len(preds.shape) > 1 and preds.shape[1] > 1:
        if preds.shape[1] == 2:
            # ROC-AUC is rank based, so it needs a continuous score rather than
            # the hard 0/1 decision. The class-1 minus class-0 margin orders
            # examples exactly like the softmax probability of class 1.
            scores = preds[:, 1] - preds[:, 0]
        # Convert 2D predictions to 1D class ids by taking the argmax
        preds = np.argmax(preds, axis=1)

    # Ensure preds is a 1D array
    preds = preds.flatten()
    if scores is None:
        # Only hard decisions are available; fall back to them
        scores = preds

    # zero_division=0 keeps the value sklearn already reports when a class is
    # never predicted, without emitting a warning on every evaluation
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, zero_division=0)
    recall = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)

    if np.unique(labels).size < 2:
        auc_score = float('nan')
    else:
        auc_score = roc_auc_score(labels, scores)

    # Return as a dictionary
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc_score
    }

In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    # Warmup is given as an explicit step count. The previous warmup_ratio=0.1
    # was dropped because warmup_steps overrides it, so it never had an effect.
    warmup_steps=120,
    weight_decay=0.02,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    learning_rate=5e-6,
    logging_steps=10,
    # Evaluate and checkpoint once per epoch. eval_steps/save_steps were
    # removed: they are only read when the strategy is "steps".
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,     # load the best model when training is finished
    # Gradient clipping to prevent exploding gradients
    max_grad_norm=1.0,
    # Label smoothing to prevent overconfidence
    label_smoothing_factor=0.1,
    # Metric used to pick the best checkpoint
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [None]:
# Define the Trainer
trainer = Trainer(
    model=model,                         # the BERTweet model for sequence classification
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # validation dataset
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Add early stopping here


)

In [None]:
 trainer.train()

In [None]:
# Evaluate on the test set
results = trainer.evaluate(test_dataset)

In [None]:
print(results)

In [None]:
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test AUC Score: {results['eval_auc']:.4f}")

In [None]:
# Generate predictions on the test dataset
predictions = trainer.predict(test_dataset)

In [None]:
# Extract the predicted labels and true labels
predicted_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = test_labels.numpy()

In [None]:
cm = confusion_matrix(true_labels, predicted_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-DEI', 'DEI'],
            yticklabels=['Non-DEI', 'DEI'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

## Save the Model

In [None]:
# Save the fine-tuned model
model.save_pretrained('./fine_tuned_bert')
tokenizer.save_pretrained('./fine_tuned_bert')

## Load and Prepare Unseen Dataset

In [None]:
directory_main = '/kaggle/input/tweets-main/Tweets'
directory_engg = '/kaggle/input/engineering-tweets-dataset/Output Data [Engineering]/Output Data [Engineering]'
directory_business = '/kaggle/input/tweets-business/Output Data [Business]-20241121T042244Z-001/Output Data [Business]'
directory_law = '/kaggle/input/tweets-law/Output Data [Law]'
directory_med = '/kaggle/input/tweets-med/Output Data [Med]'

In [None]:
def create_dataset(directory):
    """Read every CSV file in ``directory`` and stack them into one DataFrame.

    Files are read in sorted filename order so the row order of the result is
    reproducible between runs (``os.listdir`` returns entries in arbitrary
    order). Entries that are not ``.csv`` files, including sub-directories,
    are ignored.

    Args:
        directory: Path of a folder containing CSV files.

    Returns:
        A single DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If the directory contains no CSV files.
    """
    csv_paths = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if filename.lower().endswith('.csv') and os.path.isfile(path):
            csv_paths.append(path)

    if not csv_paths:
        raise ValueError(f"No CSV files found in {directory!r}")

    return pd.concat([pd.read_csv(path) for path in csv_paths], axis=0, ignore_index=True)

In [None]:
tweets_main = create_dataset(directory_main)
tweets_engg = create_dataset(directory_engg)
tweets_business = create_dataset(directory_business)
tweets_law = create_dataset(directory_law)
tweets_med = create_dataset(directory_med)

In [None]:
unseen_dataset = pd.concat([tweets_main, tweets_engg, tweets_business, tweets_law, tweets_med], axis = 0, ignore_index = True)

In [None]:
unseen_dataset.shape

In [None]:
unseen_dataset.head()

## Load the Saved Model and Tokenizer

In [None]:
# Load the model and tokenizer
model_path = './fine_tuned_bert'
tokenizer = AutoTokenizer.from_pretrained(model_path)
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
class TweetTokenizer:
    """Tokenize tweets with a saved tokenizer, spreading work over CPU cores."""

    def __init__(self, model_path: str, max_length: int = 128):
        """
        Initialize tokenizer with specific configurations

        Args:
            model_path (str): Path where model and tokenizer were saved
            max_length (int): Maximum sequence length for tokenization
        """
        # Load the tokenizer from the saved path so the configuration matches
        # the one stored next to the fine-tuned model
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Set maximum sequence length
        self.max_length = max_length

        # Number of worker processes used by batch_tokenize
        self.num_cores = multiprocessing.cpu_count()

    def batch_tokenize(self, tweets: List[str]) -> Dict[str, torch.Tensor]:
        """
        Tokenize tweets in batches using multiprocessing

        Args:
            tweets (List[str]): List of tweet texts to tokenize

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` tensors, one row
            per tweet. An empty input yields tensors of shape (0, max_length).
        """
        if not tweets:
            # Avoid spawning a pool for nothing and keep a well-formed 2-D
            # integer result instead of a 1-D float tensor.
            return {
                'input_ids': torch.empty((0, self.max_length), dtype=torch.long),
                'attention_mask': torch.empty((0, self.max_length), dtype=torch.long),
            }

        # Never start more workers than there are tweets to process
        worker_count = max(1, min(self.num_cores, len(tweets)))
        with multiprocessing.Pool(worker_count) as pool:
            # Map tokenization across multiple CPU cores
            tokenized_inputs = pool.map(self._tokenize_single, tweets)

        # Convert list of dictionaries to tensors
        return self._convert_to_tensors(tokenized_inputs)

    def _tokenize_single(self, tweet: str) -> Dict[str, List[int]]:
        """
        Tokenize a single tweet

        Args:
            tweet (str): Single tweet text

        Returns:
            Tokenized representation of the tweet
        """
        # Perform tokenization with specific parameters
        encoding = self.tokenizer(
            tweet,
            truncation=True,  # Cut off sequences longer than max_length
            padding='max_length',  # Pad to max_length
            max_length=self.max_length,
            return_tensors=None  # Return as lists for multiprocessing
        )
        return encoding

    def _convert_to_tensors(self, tokenized_inputs: List[Dict]) -> Dict[str, torch.Tensor]:
        """
        Convert tokenized inputs to PyTorch tensors

        Args:
            tokenized_inputs (List[Dict]): List of tokenized tweet representations

        Returns:
            Dictionary of tensors
        """
        return {
            'input_ids': torch.tensor([x['input_ids'] for x in tokenized_inputs]),
            'attention_mask': torch.tensor([x['attention_mask'] for x in tokenized_inputs])
        }

In [None]:
class TweetDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        """
        Create a PyTorch Dataset for tweets

        Args:
            dataframe (pd.DataFrame): DataFrame with a 'Tweet' column
            tokenizer: Callable tokenizer invoked as
                ``tokenizer(text, truncation=..., padding=..., max_length=...,
                return_tensors='pt')`` that returns a mapping with
                'input_ids' and 'attention_mask'
            max_length (int): Length every tweet is padded/truncated to
        """
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Precompute tweet texts to avoid repeated DataFrame column access.
        # Missing values become "" and any other non-string value is
        # stringified, so the tokenizer always receives text.
        self.tweets = [
            "" if pd.isna(text) else str(text)
            for text in self.dataframe['Tweet'].tolist()
        ]

    def __len__(self) -> int:
        """
        Return total number of tweets in the dataset
        """
        return len(self.tweets)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Tokenize a single tweet

        Returns:
            Dict of tokenized inputs, each a 1-D tensor for this tweet
        """
        # Tokenize single tweet
        encoding = self.tokenizer(
            self.tweets[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # squeeze(0) drops only the leading batch dimension; a bare squeeze()
        # would also collapse the sequence dimension when it has length 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

In [None]:
def dataframe_chunk_generator(unseen_dataset, chunk_size=100000):
    """
    Lazily split a DataFrame into consecutive row slices.

    Args:
        unseen_dataset (pd.DataFrame): Frame to iterate over
        chunk_size (int): Maximum number of rows in each slice

    Yields:
        Positional slices of ``unseen_dataset`` in row order; the final slice
        may hold fewer than ``chunk_size`` rows
    """
    row_starts = range(0, len(unseen_dataset), chunk_size)
    yield from (unseen_dataset.iloc[start:start + chunk_size] for start in row_starts)

In [None]:
def process_large_dataset_generator(unseen_dataset, model, tokenizer,  output_dir, checkpoint_prefix, chunk_size, batch_size):
    """
    Run inference over a large DataFrame chunk by chunk, checkpointing each chunk.

    Args:
        unseen_dataset (pd.DataFrame): Concatenated DataFrame
        model (torch.nn.Module): Trained model
        tokenizer (Tokenizer): Tokenization object
        output_dir (str): Directory where per-chunk ``.npy`` checkpoints are stored
        checkpoint_prefix (str): Filename prefix; the 1-based chunk index and
            ``.npy`` are appended to it
        chunk_size (int): Number of rows per chunk
        batch_size (int): Number of samples per batch

    Returns:
        List with one softmax probability vector per row of ``unseen_dataset``,
        in row order. Chunks restored from an existing checkpoint are included.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Disable dropout and other train-only behaviour for inference
    model.eval()

    # List to store all predictions
    all_predictions = []

    # Calculate total number of chunks
    total_chunks = (len(unseen_dataset) + chunk_size - 1) // chunk_size

    # os.cpu_count() may return None, which min() cannot compare with an int
    num_workers = min(4, os.cpu_count() or 1)

    # Use the generator to process chunks
    for chunk_index, chunk in enumerate(dataframe_chunk_generator(unseen_dataset, chunk_size), 1):

        # Define the checkpoint file path for the current chunk
        checkpoint_file = os.path.join(output_dir, f"{checkpoint_prefix}{chunk_index}.npy")

        # Reuse an existing checkpoint instead of recomputing the chunk. Its
        # predictions must still be returned, otherwise a resumed run would
        # yield fewer predictions than there are rows.
        if os.path.exists(checkpoint_file):
            saved_predictions = np.load(checkpoint_file)
            if len(saved_predictions) == len(chunk):
                print(f"Skipping chunk {chunk_index}, already processed.")
                all_predictions.extend(saved_predictions)
                continue
            # A checkpoint with a different row count belongs to other data
            # (e.g. a different chunk_size), so recompute this chunk.
            print(f"Checkpoint for chunk {chunk_index} does not match the chunk, recomputing.")

        print(f"Processing chunk {chunk_index}")

        # Create dataset for current chunk
        dataset = TweetDataset(chunk, tokenizer)

        # Create DataLoader
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=torch.cuda.is_available()
        )

        # Inference for current chunk
        chunk_predictions = []
        with torch.no_grad():
            for batch in tqdm(dataloader, desc=f"Chunk {chunk_index}/{total_chunks}", leave=False):
                # Move inputs to appropriate device
                input_ids = batch['input_ids'].to(model.device)
                attention_mask = batch['attention_mask'].to(model.device)

                # Model prediction
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                # Turn logits into class probabilities
                predictions = torch.softmax(outputs.logits, dim=1)

                # Convert to numpy and extend predictions
                chunk_predictions.extend(predictions.cpu().numpy())

        # Release cached GPU memory once per chunk rather than once per batch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Accumulate chunk predictions
        all_predictions.extend(chunk_predictions)

        # Indicate that the current chunk has finished processing
        print(colored(f"Finished processing chunk {chunk_index}/{total_chunks}", "green"))

        # Save predictions for the current chunk
        np.save(checkpoint_file, chunk_predictions)
        print(f"\033[95mSaved predictions for chunk {chunk_index} to {checkpoint_file}\033[0m")

    print("\033[92mAll chunks processed successfully!\033[0m")

    return all_predictions

In [None]:
%%javascript
// Dispatch a synthetic 'mousemove' event on the page <body>.
// NOTE(review): presumably meant to make the page look active so the session
// is not treated as idle; whether the host honours synthetic events is not
// established here.
function keepNotebookAwake() {
    var element = document.querySelector('body');
    // Bubbling, cancelable mouse event bound to the current window
    var event = new MouseEvent('mousemove', {
        'view': window,
        'bubbles': true,
        'cancelable': true
    });
    element.dispatchEvent(event);
}

// Run this every 5 minutes
setInterval(keepNotebookAwake, 300000);

In [None]:
# Process the entire dataset with the fine-tuned checkpoint reloaded from disk.
# Using `loaded_model` (not the in-memory training `model`) means this cell
# scores with the saved weights, placed on the device selected above.
loaded_model.to(device)
loaded_model.eval()

predictions = process_large_dataset_generator(
    unseen_dataset,
    loaded_model,
    tokenizer,
    output_dir="checkpoints/",
    checkpoint_prefix="predictions_chunk_",
    chunk_size=100000,
    batch_size=64
)

In [None]:
binary_labels = []

for pred in predictions:
    # Label is 1 if probability of class 1 is higher than a threshold
    label = 1 if pred[1] >= 0.8 else 0
    binary_labels.append(label)

In [None]:
# For every probability vector keep the winning class index and its probability
predicted_labels = [np.argmax(class_probs) for class_probs in predictions]
confidence_scores = [np.max(class_probs) for class_probs in predictions]

In [None]:
unseen_dataset['Confidence_Score'] = confidence_scores

In [None]:
# Add predictions to the unseen dataframe
unseen_dataset['Predicted_DEI'] = predicted_labels

In [None]:
sample_250k = unseen_dataset.sample(n=250000, random_state = 42)
sample_250k.to_csv('sample_250k.csv')

In [None]:
sample_20k = unseen_dataset.sample(n=20000, random_state = 42)
sample_20k.to_csv('sample_20k.csv')

In [None]:
unseen_dataset.head()

In [None]:
unseen_dataset['Predicted_DEI'].value_counts()

In [None]:
# Save the predictions to a new CSV file
unseen_dataset.to_csv('/kaggle/working/predicted_unseen_data.csv', index=False)

In [None]:
# Load the unseen data with predictions
predicted_df = unseen_dataset.copy()

## Visualizations

## Trend of DEI Tweets over time

In [None]:
# Filter out non-date entries, including "0".
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not write into a slice of the original.
predicted_df = predicted_df[~predicted_df["Date"].astype(str).str.isnumeric()].copy()

In [None]:
predicted_df["Date"] = pd.to_datetime(predicted_df["Date"])
predicted_df["Year"] = predicted_df["Date"].dt.year

In [None]:
predicted_dei_tweets_by_year = predicted_df[predicted_df["Predicted_DEI"]==1].groupby("Year").size().reset_index(name="DEI_Tweet_Count")

In [None]:
# Create a DataFrame with all years from 2010 to 2022
years = pd.DataFrame({'Year': list(range(2010, 2023))})

# Merge with the original data (predicted_dei_tweets_by_year) to ensure all years are present
predicted_dei_tweets_by_year = years.merge(predicted_dei_tweets_by_year, on='Year', how='left')

# Fill missing DEI tweet counts with 0.
# Assign the result back: calling fillna(inplace=True) on a selected column is
# deprecated and does not update the frame under pandas copy-on-write.
predicted_dei_tweets_by_year['DEI_Tweet_Count'] = predicted_dei_tweets_by_year['DEI_Tweet_Count'].fillna(0)

# Plotting the line chart
plt.figure(figsize=(9, 5))  # Set figure size
plt.plot(predicted_dei_tweets_by_year['Year'],
         predicted_dei_tweets_by_year['DEI_Tweet_Count'],
         marker='o',  # Add markers at each point
         label='DEI Tweets Count')

# Customize layout (axis labels, title, etc.)
plt.title('Predicted DEI-Related Tweets Over Time', fontsize=14, loc='center')  # Center the title
plt.xlabel('Year', fontsize=12)
plt.ylabel('DEI Tweets Count', fontsize=12)
plt.grid(visible=True, linestyle='--', alpha=0.7)  # Add a grid for better readability
plt.xticks(predicted_dei_tweets_by_year['Year'], rotation=45)  # Rotate x-axis labels if needed
plt.tight_layout()  # Adjust layout to prevent clipping
plt.legend()  # Add legend
plt.show()


In [None]:
predicted_dei_tweets_by_year

In [None]:
# HBCUs of interest. Each name needs its own trailing comma: without one,
# Python silently concatenates adjacent string literals into a single entry.
hbcu_list = [
    'Florida A&M University',
    'North Carolina Agricultural & Technical State University',
    'Howard University',
    'Spelman College',
    'Morgan State University',
]

In [None]:
uni_df = predicted_df[(predicted_df["Predicted_DEI"]==1) & (predicted_df["University"]=="Florida A&M University")]
uni_df['Tweet'].sample(n=10).tolist()

In [None]:
uni_df.shape

In [None]:
# List of selected universities
selected_universities = ['Harvard University', 'Yale University', 'Columbia University', 'Massachusetts Institute of Technology (MIT)', 'Stanford University']  # Add your universities here

# Filter data for selected universities and DEI tweets
filtered_df = predicted_df[(predicted_df['University'].isin(selected_universities)) &
                           (predicted_df['Predicted_DEI'] == 1)]

# Group by university and count DEI tweets
dei_tweet_counts = filtered_df.groupby('University').size().reset_index(name='DEI_Tweet_Count')

# Print each university name and its DEI tweet count
for index, row in dei_tweet_counts.iterrows():
    print(f"University: {row['University']}, DEI Tweet Count: {row['DEI_Tweet_Count']}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the tweets predicted as DEI
dei_tweets = predicted_df[predicted_df['Predicted_DEI'] == 1]

# The five universities with the most DEI tweets overall
top_unis = dei_tweets.groupby('University').size().nlargest(5).index.tolist()

# Rows belonging to those universities
top_unis_df = dei_tweets[dei_tweets['University'].isin(top_unis)]

# Year x University table of DEI tweet counts (missing combinations become 0).
# unstack() already leaves 'Year' as the index, which the stacked bar chart uses.
yearly_dei = top_unis_df.groupby(['Year', 'University']).size().unstack(fill_value=0)

# Stacked bars on a fresh 12x6 figure
plt.figure(figsize=(12, 6))
yearly_dei.plot(kind='bar', stacked=True, cmap='tab10', ax=plt.gca())

# Titles, axis labels and legend placement
plt.title('Top 5 Universities DEI Tweet Counts Over Years', size=16)
plt.xlabel('Year', size=12)
plt.ylabel('Number of DEI Tweets', size=12)
plt.xticks(rotation=45)  # Tilt the year labels so they do not overlap
plt.legend(title='University', bbox_to_anchor=(1.05, 1), loc='upper left')

# Render
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Filter for DEI tweets only
dei_tweets = predicted_df[predicted_df['Predicted_DEI'] == 1]

# Get the 10 universities with the highest total DEI tweet count
top_unis = (dei_tweets.groupby('University')
            .size()
            .nlargest(10)
            .index
            .tolist())

# Filter data for top universities
top_unis_df = dei_tweets[dei_tweets['University'].isin(top_unis)]

# Calculate yearly DEI tweets for each university
yearly_dei = (top_unis_df.groupby(['Year', 'University'])
              .size()
              .unstack(fill_value=0))  # Create a wide format with years as index and universities as columns

# Create a stacked bar chart
fig = go.Figure()

# Add a bar for each university
for uni in yearly_dei.columns:
    fig.add_trace(go.Bar(
        x=yearly_dei.index,
        y=yearly_dei[uni],
        name=uni,
        hoverinfo='y+name',
        marker=dict(line=dict(width=0))
    ))

# Update layout for stacked bar chart.
# The title now matches the number of universities selected above (10).
fig.update_layout(
    title='Top 10 Universities DEI Tweet Counts Over Years',
    xaxis_title='Year',
    yaxis_title='Number of DEI Tweets',
    barmode='stack',
    legend_title='University'
)

# Show the plot
fig.show()

### DEI Trend Over the Years in HBCUs

In [None]:
# List of selected universities
selected_universities = ['Florida A&M University', 'Howard University', 'Spelman College',
                         'Morgan State University', 'North Carolina Agricultural and Technical State University']

# Filter data for selected universities and DEI tweets
filtered_df = predicted_df[(predicted_df['University'].isin(selected_universities)) &
                           (predicted_df['Predicted_DEI'] == 1)]

# Group by university and count DEI tweets
dei_tweet_counts = filtered_df.groupby('University').size().reset_index(name='DEI_Tweet_Count')

# Print each university name and its DEI tweet count
for index, row in dei_tweet_counts.iterrows():
    print(f"University: {row['University']}, DEI Tweet Count: {row['DEI_Tweet_Count']}")

### DEI Tweet Counts Over the Years in HBCUs

In [None]:
# Filter DEI tweets down to the selected universities
top_unis_df = dei_tweets[dei_tweets['University'].isin(selected_universities)]

# Calculate yearly DEI tweets for each university.
# (A duplicate computation whose result was immediately overwritten has been removed.)
yearly_dei = (top_unis_df.groupby(['Year', 'University'])
              .size()
              .unstack(fill_value=0))  # Create a wide format with years as index and universities as columns

# Create a stacked bar chart
fig = go.Figure()

# Add a bar for each university
for uni in yearly_dei.columns:
    fig.add_trace(go.Bar(
        x=yearly_dei.index,
        y=yearly_dei[uni],
        name=uni,
        hoverinfo='y+name',
        marker=dict(line=dict(width=0))
    ))

# Update layout for stacked bar chart
fig.update_layout(
    title='HBCU DEI Tweet Counts Over Years',
    xaxis_title='Year',
    yaxis_title='Number of DEI Tweets',
    barmode='stack',
    legend_title='University'
)

# Show the plot
fig.show()

In [None]:
grouped_df = predicted_df.groupby(['University', 'Year'])['Predicted_DEI'].sum().reset_index()

# Step 2: Get the top 5 universities with the highest total DEI tweet counts
top_unis = grouped_df.groupby('University')['Predicted_DEI'].sum().nlargest(5).index

# Step 3: Filter the DataFrame for only those top 5 universities
filtered_df = grouped_df[grouped_df['University'].isin(top_unis)]

In [None]:
# Grouped bars: one bar per university within each year.
# The labels mapping must be keyed by the plotted column name
# ('Predicted_DEI'); the former key 'DEI' matched no column, so the axis label
# was never replaced.
fig = px.bar(filtered_df, x='Year', y='Predicted_DEI', color='University',
             barmode='group',
             labels={'Predicted_DEI': 'Count of DEI Tweets'},
             title='Predicted Top 5 Universities DEI Tweet Counts by Year')

# Show the plot
fig.show()

### Percentage of DEI tweets in labeled dataset

In [None]:
# Step 1: Extract the year. `df` was loaded with a plain read_csv, so 'Date'
# is not a datetime column yet and must be parsed before using the .dt accessor.
df['Year'] = pd.to_datetime(df['Date']).dt.year

# Step 2: Group by year and count total tweets and DEI-related tweets
yearly_counts = df.groupby('Year').agg(
    Total_Tweets=('DEI', 'count'),
    DEI_Tweets=('DEI', 'sum')
).reset_index()

# Step 3: Calculate percentage of DEI-related tweets
yearly_counts['DEI_Percentage'] = (yearly_counts['DEI_Tweets'] / yearly_counts['Total_Tweets']) * 100

# Step 4: Plot the results
plt.figure(figsize=(10, 6))
sns.barplot(x='Year', y='DEI_Percentage', data=yearly_counts, palette='viridis')

plt.title('Percentage of DEI-Related Tweets by Year', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Percentage of DEI Tweets (%)', fontsize=14)
plt.xticks(rotation=45)  # Rotate x-ticks for better visibility
plt.tight_layout()

# Save the figure if needed
plt.savefig('/kaggle/working/DEI_Percentage_by_Year.png', dpi=300)

# Show the plot
plt.show()

### Percentage of Predicted DEI Tweets over the Years

In [None]:
# Step 1: Extract the year from the parsed 'Date' column
predicted_df['Year'] = predicted_df['Date'].dt.year

# Step 2: Group by year and count total tweets and DEI-related tweets
yearly_counts = predicted_df.groupby('Year').agg(
    Total_Tweets=('Predicted_DEI', 'count'),
    DEI_Tweets=('Predicted_DEI', 'sum')
).reset_index()

# Step 3: Calculate percentage of DEI-related tweets
yearly_counts['DEI_Percentage'] = (yearly_counts['DEI_Tweets'] / yearly_counts['Total_Tweets']) * 100

# Step 4: Plot the results
plt.figure(figsize=(10, 6))
sns.barplot(x='Year', y='DEI_Percentage', data=yearly_counts, palette='viridis')

plt.title('Percentage of Predicted DEI-Related Tweets by Year', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Percentage of DEI Tweets (%)', fontsize=14)
plt.xticks(rotation=45)  # Rotate x-ticks for better visibility
plt.tight_layout()

# Save under its own filename so this figure does not overwrite the
# labeled-dataset chart saved as DEI_Percentage_by_Year.png
plt.savefig('/kaggle/working/Predicted_DEI_Percentage_by_Year.png', dpi=300)

# Show the plot
plt.show()

In [None]:
import plotly.express as px

# Create a line chart using Plotly
fig = px.line(yearly_counts,
              x='Year',
              y='DEI_Percentage',
              markers=True,  # Show markers at data points
              title='Percentage of Predicted DEI-Related Tweets by Year')

# Customize the layout
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Percentage of DEI Tweets (%)',
    title_x=0.5,  # Center the title
    width=900,  # Set figure size
    height=600
)

# Rotate x-ticks for better visibility
fig.update_xaxes(tickangle=45)

# Show the figure
fig.show()

# Optional: Save the figure (using plotly's write_image function if needed)
# fig.write_image('/kaggle/working/DEI_Percentage_by_Year_LineChart.png')

In [None]:
import plotly.graph_objects as go

# Create traces for DEI Tweet Count and DEI Percentage
trace1 = go.Scatter(
    x=predicted_dei_tweets_by_year['Year'],
    y=predicted_dei_tweets_by_year['DEI_Tweet_Count'],
    mode='lines+markers',
    name='DEI Tweet Count',
    line=dict(color='blue', width=2),
    marker=dict(color='blue')
)

trace2 = go.Scatter(
    x=yearly_counts['Year'],
    y=yearly_counts['DEI_Percentage'],
    mode='lines+markers',
    name='DEI Percentage (%)',
    line=dict(color='green', width=2, dash='dash'),
    marker=dict(color='green'),
    yaxis='y2'  # Specify secondary y-axis
)

# Create the layout with dual y-axes.
# Axis title fonts use `title_font` on every axis: the older `titlefont`
# spelling is deprecated and was inconsistent with the x axis.
layout = go.Layout(
    title='Predicted DEI-Related Tweet Counts and Percentage Over Time',
    xaxis=dict(title='Year', title_font=dict(size=14)),
    yaxis=dict(
        title='DEI Tweet Count',
        title_font=dict(color='blue', size=14),
        tickfont=dict(color='blue')
    ),
    yaxis2=dict(
        title='DEI Percentage (%)',
        title_font=dict(color='green', size=14),
        tickfont=dict(color='green'),
        overlaying='y',
        side='right'
    ),
    width=800,
    height=500,
    legend=dict(
        orientation="h",  # Horizontal orientation
        yanchor="bottom",
        y=1.03,  # Position above plot
        xanchor="center",
        x=0.5
    ),

)

# Create the figure
fig = go.Figure(data=[trace1, trace2], layout=layout)

# Show the plot
fig.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming your DataFrame is already created and named df

# Step 1: Extract the year from the Date column. `df` was loaded with a plain
# read_csv, so 'Date' must be parsed before the .dt accessor can be used.
df['Year'] = pd.to_datetime(df['Date']).dt.year

# Step 2: Group by year and count total tweets and DEI-related tweets
yearly_counts = df.groupby('Year').agg(
    Total_Tweets=('DEI', 'count'),
    DEI_Tweets=('DEI', 'sum')
).reset_index()

# Step 3: Melt the DataFrame to get it in the long format for plotting
yearly_counts_melted = yearly_counts.melt(id_vars='Year',
                                           value_vars=['Total_Tweets', 'DEI_Tweets'],
                                           var_name='Tweet_Type',
                                           value_name='Count')

# Step 4: Plot the results
plt.figure(figsize=(12, 6))
sns.barplot(x='Year', y='Count', hue='Tweet_Type', data=yearly_counts_melted, palette='viridis')

plt.title('Total Tweets and DEI-Related Tweets by Year', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Number of Tweets', fontsize=14)
plt.xticks(rotation=45)  # Rotate x-ticks for better visibility
plt.legend(title='Tweet Type')
plt.tight_layout()

# Save the figure if needed
plt.savefig('/kaggle/working/Total_and_DEI_Tweets_by_Year.png', dpi=300)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Yearly totals for the unseen tweets in `predicted_df`: all tweets vs. tweets
# the model predicted as DEI. The `.dt` accessor requires 'Date' to be datetime-like.

# Step 1: Extract the year from the Date column (later cells filter on 'Year')
predicted_df['Year'] = predicted_df['Date'].dt.year

# Step 2: Group by year and count total tweets and predicted-DEI tweets
yearly_counts = predicted_df.groupby('Year').agg(
    Total_Tweets=('Predicted_DEI', 'count'),
    DEI_Tweets=('Predicted_DEI', 'sum')
).reset_index()

# Step 3: Melt the DataFrame to get it in the long format for plotting
yearly_counts_melted = yearly_counts.melt(id_vars='Year',
                                           value_vars=['Total_Tweets', 'DEI_Tweets'],
                                           var_name='Tweet_Type',
                                           value_name='Count')

# Step 4: Plot the results
plt.figure(figsize=(12, 6))
sns.barplot(x='Year', y='Count', hue='Tweet_Type', data=yearly_counts_melted, palette='viridis')

plt.title('Total Unseen Tweets and Predicted DEI-Related Tweets by Year', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Number of Tweets', fontsize=14)
plt.xticks(rotation=45)  # Rotate x-ticks for better visibility
plt.legend(title='Tweet Type')
plt.tight_layout()

# Save under its own file name: the labelled-data chart is written to
# Total_and_DEI_Tweets_by_Year.png, and reusing that path here would overwrite it.
plt.savefig('/kaggle/working/Total_and_Predicted_DEI_Tweets_by_Year.png', dpi=300)

# Show the plot
plt.show()

## TF-IDF Analysis

In [None]:
# Preview the first rows of predicted_df before starting the TF-IDF analysis.
predicted_df.head()

In [None]:
# HBCU names used to split tweets into HBCU / non-HBCU groups.
# Every entry needs its own comma: Python silently concatenates adjacent string
# literals, which previously fused the NC A&T and Howard names into a single
# string that matched neither university.
hbcu_list = [
    'Florida A&M University',
    'North Carolina Agricultural & Technical State University',
    'Howard University',
    'Spelman College',
    'Morgan State University',
]

In [None]:
# Tweets predicted as DEI that were posted in 2019.
# `.copy()` makes this an independent frame, so adding the 'Processed_Tweet'
# column to it later does not trigger pandas' SettingWithCopyWarning.
# To inspect HBCU tweets instead, replace the year condition with
# predicted_df['University'].isin(hbcu_list).
predicted_dei_tweets = predicted_df[
    (predicted_df['Predicted_DEI'] == 1) & (predicted_df['Year'] == 2019)
].copy()

predicted_dei_tweets.head()

### Preprocess Tweets

In [None]:
# Fetch the NLTK corpora needed below; nltk.download skips data that is already up to date.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

In [None]:
# Shell step: extract the WordNet archive next to the zip inside NLTK's corpora folder.
# NOTE(review): presumably needed because the lemmatizer cannot read the zipped corpus
# in this environment — confirm before removing.
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
# Shared resources for preprocess_text: the WordNet lemmatizer and the English
# stopword list (stored as a set for fast membership checks).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [None]:
# Define the preprocessing function
def preprocess_text(text):
    """Normalize one tweet into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, strip URLs, strip digits, strip punctuation
    (this also removes '#', '+', apostrophes and hyphens, so a hashtag such as
    '#blm' becomes the plain token 'blm'), drop stopwords, lemmatize.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` object.

    Args:
        text: The raw tweet. Non-string values (e.g. NaN from a missing CSV
            cell, or None) are treated as an empty tweet.

    Returns:
        str: The cleaned tokens joined by single spaces; '' when nothing is left.
    """
    # Missing values arrive as float NaN / None and have no .lower(); treat as empty.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Remove URLs. 'http\S+' already covers https links, so no separate branch is needed.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove numbers, then every character that is neither a word character nor whitespace.
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize on whitespace, drop stopwords and lemmatize in a single pass.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    # Join tokens back into a single string
    return ' '.join(tokens)

In [None]:
# Store the cleaned version of every tweet alongside the raw text.
predicted_dei_tweets['Processed_Tweet'] = predicted_dei_tweets['Tweet'].map(preprocess_text)

In [None]:
# Raw vocabulary of DEI-related terms, phrases and hashtags used to filter the
# TF-IDF features. Entries are mixed-case and the list contains duplicates
# (e.g. "LGBTQ+" appears several times).
# Many entries contain '#', '+', hyphens or apostrophes, all of which
# preprocess_text strips from tweets, so keywords must be normalized the same way
# before they are compared against TF-IDF feature names.
dei_keywords = [
    "Diversity", "Equity", "Equality", "Inclusion", "Inclusive", "Fairness", "Justice",
    "Representation", "Bias", "Privilege", "Discrimination", "Intersectionality",
    "Accessibility", "Allyship", "Belonging", "Cultural competence", "Social justice",
    "Equal pay", "Anti-racism", "Marginalization", "Oppression", "Civil rights",
    "Gender equality", "Racial equality", "LGBTQ+", "LGBTQ+", "LGBTQ",
    "Disability rights", "Neurodiversity", "Multiculturalism", "Safe space",
    "Inclusive language", "Unconscious bias", "Microaggressions", "Cultural sensitivity",
    "Advocacy", "Veteran", "Women", "Men", "Girl", "Boy", "Gender",
    "Female", "Male", "Non-binary", "Transgender", "Queer", "Black",
    "African American", "Asian", "Latino", "Hispanic", "Indigenous",
    "Native American", "Pacific Islander", "White", "Person of Color", "BIPOC",
    "Disabled", "Neurodiverse", "Immigrant", "Refugee", "Minority",
    "Underrepresented", "Migrant", "Ethnicity", "Race", "Sexual orientation",
    "Sexual", "Religion", "Christian", "Muslim", "Jewish", "Buddhist",
    "Hindu", "Interfaith", "Faith-based", "Cultural background",
    "Ethnic diversity", "Ethnic", "Ageism", "Sexism", "Racism",
    "Homophobia", "Transphobia", "Xenophobia", "Islamophobia", "Ableism",
    "Sexual harassment", "Workplace diversity", "Gender identity", "Poverty",
    "Atheism", "Mental health", "Mental", "Multicultural", "Diverse",
    "Interracial", "Equal", "Equalizing", "Equalized", "Justifying",
    "Justified", "Nationality", "National", "Nationalized", "Nationalizing",
    "Heritage", "Ancestry", "LGBTQ+", "Heterosexual", "Heterosexuality",
    "Homosexual", "Homosexuality", "Bisexual", "Bisexuality", "Pansexual",
    "Pansexuality", "Asexual", "Asexuality", "Cisgender", "Genderqueer",
    "Genderfluid", "Agender", "Faith", "Belief", "Beliefs",
    "Spirituality", "Spiritual", "Spiritualizing", "Spiritualized",
    "Bilingual", "Bilingualism", "Multilingual", "Multilingualism",
    "Age", "Aging", "Aged", "Generational", "Elderly",
    "Partnership", "Partnering", "Partnered", "Military", "Veteran",
    "Veteranized", "Cognitive", "Cognition", "Neurodiversity", "Neurodiverse",
    "Disability", "Disabled", "Disabling", "Mobility", "Mobile",
    "Mobilizing", "Mobilized", "Accessibility", "Accessible", "Inclusive",
    "Tolerance", "Tolerating", "Tolerant", "Worldview", "Identity",
    "Identified", "Identify", "Awareness", "Aware", "Representation",
    "Representing", "Represented", "Representative", "Fairness",
    "Fair", "Fairing", "Equity", "Equitable", "Equitably",
    "Impartiality", "Impartial", "Disparity", "Disparate",
    "Disparaging", "Disparaged", "Barrier", "Barring", "Access",
    "Accessing", "Biasing", "Biased", "Biasness", "Socializing",
    "Socialized", "Empowerment", "Empowering", "Empowered",
    "Belonging", "Belong", "Belonged", "Inclusion", "Included",
    "Including", "Christianity", "Christian", "Christianized",
    "Christianizing", "Islam", "Islamic", "Islamized",
    "Islamizing", "Judaism", "Jewish", "Buddhism", "Buddhist",
    "Hinduism", "Hindu", "Atheism", "Atheist", "Agnosticism",
    "Agnostic", "Global", "Minor", "Emotion", "Emotional",
    "Cross-culture", "Cross culture", "Cross-cultural", "Anti-bias",
    "#Melanin", "Black Futures Month", "Black Men", "Black Panther Movement",
    "Racism", "Antiracism", "racial profiling", "community freedom",
    "people of color", "racial justice", "race relations",
    "White Supremacy", "Anti Black", "BLM", "Black Lives Matter", "Black Kids Matter", "George Floyd","black lives matter", "#blm", "no justice no peace", "say their names",
    "i can't breathe", "hands up don't shoot", "#blackhistorymonth",
    "#nomorenames", "#endpolicebrutality", "#defundthepolice", "acab",
    "#wewillbreathe", "#theshowmustbepaused", "#blackouttuesday",
    "power to the people", "rest in power", "#racismisavirus",
    "#blackexcellence", "#blackjoy", "#supportblackbusiness",
    "#buyblack", "#blackownedbusiness", "#communityfirst",
    "#powertothepeople", "#buildingcommunity", "#strongertogether",
    "#blackexcellence", "#blacksuccess", "say their name",
    "never forget", "#justiceforbreonnataylor", "#remembertheirnames",
    "#sayhername", "#takeaknee", "#abolishthepolice",
    "#wewillnotforget", "#foreverinourhearts", "#amplifyblackvoices",
    "#neveragain", "#blmprotests", "legacy lives on", "#juneteenth",
    "#blackintech", "#blacktwitter", "#blackgirlscode",
    "#blackentrepreneurs", "#melanin", "black futures month",
    "black men", "black panther movement", "black", "racial", "racism", "antiracism",
    "racial profiling", "community freedom", "people of color",
    "racial justice", "race relations", "#shareblackstories",
    "#amplifymelanatedvoices", "#passthemic", "#raiseyourvoice",
    "#speakup", "#useyourplatform", "#aminext", "#sharethemic",
    "#dosomething", "#takeaction", "#makenoise", "#disruptthenarrative",
    "#wecannotbesilent", "#thisstopstoday", "#notonemore",
    "freedom now", "white silence is violence", "black power",
    "anti black racism", "black scientists", "black owned",
    "black kids matter", "now we transform", "white supremacy",
    "#blackqueerlives","blackhistorymonth", "george floyd", "systemic racism", "hbcu", "police brutality"
]

In [None]:
# HBCU names used to split predicted-DEI tweets into HBCU / non-HBCU groups.
# Two fixes relative to the earlier version of this list:
#   * every entry has its own comma (adjacent string literals are silently
#     concatenated, which fused the NC A&T and Howard names into one string);
#   * 'Spelman College' matches the spelling used by the other university lists
#     in this notebook ('Spelman University' matched no rows).
hbcu_list = [
    'Florida A&M University',
    'North Carolina Agricultural & Technical State University',
    'Howard University',
    'Spelman College',
    'Morgan State University',
]

In [None]:
# Lower-cased copy of the keyword list (order and duplicates are kept).
dei_keywords_lower = list(map(str.lower, dei_keywords))

In [None]:
def get_top_dei_keywords(df, keywords, top_n):
    """Rank DEI keywords by their summed TF-IDF score over the tweets in ``df``.

    The tweets are cleaned with ``preprocess_text``, vectorized into 1- to
    3-gram TF-IDF features, and the features that match a DEI keyword are
    returned, highest total score first.

    Args:
        df: DataFrame with a 'Tweet' column. A 'Processed_Tweet' column holding
            the cleaned text is written onto this frame (side effect).
        keywords: Iterable of keyword strings. Each keyword is matched both as
            given and after being passed through ``preprocess_text``, because
            the TF-IDF features are built from preprocessed text (punctuation
            such as '#' removed, stopwords dropped, words lemmatized) and a raw
            keyword like '#blm' or 'black lives matter' would otherwise never
            equal a feature name.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with columns 'Keyword' and 'TF-IDF Score', sorted by score in
        descending order and truncated to ``top_n`` rows. Empty when ``df`` has
        no tweets or none of them contains any text after preprocessing.
    """
    # Kept from the original behavior: the cleaned text is stored on the caller's frame.
    df['Processed_Tweet'] = df['Tweet'].apply(preprocess_text)

    # TfidfVectorizer raises ValueError ("empty vocabulary") when there is nothing
    # to fit, e.g. a year with no predicted-DEI tweets. Return an empty ranking instead.
    if not any(df['Processed_Tweet']):
        return pd.DataFrame({
            'Keyword': pd.Series(dtype=object),
            'TF-IDF Score': pd.Series(dtype=float),
        })

    # Match keywords in their raw form and in the same normalized form as the features.
    # Keywords longer than three tokens after normalization still cannot match,
    # because the vectorizer only builds up to trigrams.
    keyword_set = set(keywords)
    keyword_set.update(preprocess_text(keyword) for keyword in keywords)
    keyword_set.discard('')

    # Initialize the TF-IDF vectorizer and fit it on the preprocessed tweets
    vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    tfidf_matrix = vectorizer.fit_transform(df['Processed_Tweet'])

    # Sum the scores across all tweets for each term. np.asarray(...).ravel()
    # flattens both np.matrix and ndarray results, unlike the matrix-only `.A1`.
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

    tfidf_df = pd.DataFrame({
        'Keyword': feature_names,
        'TF-IDF Score': tfidf_scores
    })

    # Keep only DEI-related terms, highest score first
    dei_tfidf_df = tfidf_df[tfidf_df['Keyword'].isin(list(keyword_set))]
    dei_tfidf_df = dei_tfidf_df.sort_values(by='TF-IDF Score', ascending=False)

    # Return the top N keywords
    return dei_tfidf_df.head(top_n)



In [None]:
# Boolean masks shared by every subset below.
is_predicted_dei = predicted_df['Predicted_DEI'] == 1
is_hbcu = predicted_df['University'].isin(hbcu_list)


def _predicted_dei_in_year(year):
    """Return an independent frame of predicted-DEI tweets whose 'Year' equals ``year``."""
    return predicted_df[is_predicted_dei & (predicted_df['Year'] == year)].copy()


# `.copy()` gives each subset its own data: get_top_dei_keywords writes a
# 'Processed_Tweet' column onto the frame it receives, which on a bare boolean
# slice of predicted_df triggers pandas' SettingWithCopyWarning.
predicted_dei = predicted_df[is_predicted_dei].copy()
predicted_hbcu = predicted_df[is_predicted_dei & is_hbcu].copy()
predicted_non_hbcu = predicted_df[is_predicted_dei & ~is_hbcu].copy()

# One frame per calendar year.
predicted_2015 = _predicted_dei_in_year(2015)
predicted_2016 = _predicted_dei_in_year(2016)
predicted_2017 = _predicted_dei_in_year(2017)
predicted_2018 = _predicted_dei_in_year(2018)
predicted_2019 = _predicted_dei_in_year(2019)
predicted_2020 = _predicted_dei_in_year(2020)
predicted_2021 = _predicted_dei_in_year(2021)
predicted_2022 = _predicted_dei_in_year(2022)


In [None]:
# Maximum number of keywords get_top_dei_keywords returns for each subset.
top_n = 20

In [None]:
def _rank_keywords(tweets):
    """Rank the DEI keywords for one subset using the shared keyword list and top_n."""
    return get_top_dei_keywords(tweets, dei_keywords_lower, top_n)


# Overall ranking, then one ranking per year, then the HBCU / non-HBCU split.
top_dei_keywords = _rank_keywords(predicted_dei)
top_keywords_2019 = _rank_keywords(predicted_2019)
top_keywords_2020 = _rank_keywords(predicted_2020)
top_keywords_2021 = _rank_keywords(predicted_2021)
top_keywords_2022 = _rank_keywords(predicted_2022)
top_keywords_hbcu = _rank_keywords(predicted_hbcu)
top_keywords_non_hbcu = _rank_keywords(predicted_non_hbcu)

In [None]:
# HBCU names used by the HBCU / non-HBCU comparisons.
# Each entry has its own comma (adjacent string literals are silently
# concatenated, which previously merged the NC A&T and Howard names), and
# 'Spelman College' matches the spelling used by the other university lists
# in this notebook.
hbcu_list = [
    'Florida A&M University',
    'North Carolina Agricultural & Technical State University',
    'Howard University',
    'Spelman College',
    'Morgan State University',
]

In [None]:
# Set up the subplots: a 2x2 grid, one panel per year
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axes = axes.flatten()  # Flatten the 2D array of axes to easily iterate over

# Years shown in this figure and the keyword ranking for each of them.
# The rankings are computed here from the 2015-2018 subsets; the previous version
# plotted the 2019-2022 rankings under 2015-2018 titles.
years = [2015, 2016, 2017, 2018]
top_keywords_list = [
    get_top_dei_keywords(year_tweets, dei_keywords_lower, top_n)
    for year_tweets in (predicted_2015, predicted_2016, predicted_2017, predicted_2018)
]

# Create a horizontal bar chart for each year
for ax, year, top_keywords in zip(axes, years, top_keywords_list):
    # Ensure only the top 10 keywords are plotted
    top_keywords = top_keywords.head(10)

    ax.barh(top_keywords['Keyword'], top_keywords['TF-IDF Score'], color='skyblue')
    ax.set_title(f"Top 10 DEI Keywords - {year}")
    ax.set_xlabel("TF-IDF Score")
    ax.set_ylabel("Keywords")
    ax.invert_yaxis()  # Invert y-axis to show the highest score on top

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# 2x2 grid of horizontal bar charts, one panel per year from 2019 to 2022.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axes = axes.flatten()  # 1-D view so panels can be indexed 0..3

years = [2019, 2020, 2021, 2022]
top_keywords_list = [top_keywords_2019, top_keywords_2020, top_keywords_2021, top_keywords_2022]

for panel_index, year in enumerate(years):
    ax = axes[panel_index]

    # Each ranking may hold more rows than we want to show; keep the first ten.
    top_keywords = top_keywords_list[panel_index].head(10)

    ax.barh(top_keywords['Keyword'], top_keywords['TF-IDF Score'], color='skyblue')
    ax.set_title(f"Top 10 DEI Keywords - {year}")
    ax.set_xlabel("TF-IDF Score")
    ax.set_ylabel("Keywords")
    ax.invert_yaxis()  # highest-scoring keyword at the top of the panel

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side comparison of the keyword rankings for HBCU and non-HBCU tweets.
fig, axs = plt.subplots(1, 2, figsize=(14, 6))

# (axes, ranking, bar colour, title) for each panel, left to right.
keyword_panels = [
    (axs[0], top_keywords_hbcu, 'skyblue', 'Top DEI Keywords from HBCU Tweets'),
    (axs[1], top_keywords_non_hbcu, 'lightcoral', 'Top DEI Keywords from Non-HBCU Tweets'),
]

for panel_ax, keyword_scores, bar_color, panel_title in keyword_panels:
    panel_ax.barh(keyword_scores['Keyword'], keyword_scores['TF-IDF Score'], color=bar_color)
    panel_ax.set_xlabel('TF-IDF Score')
    panel_ax.set_title(panel_title)
    panel_ax.invert_yaxis()  # highest-scoring keyword at the top

plt.tight_layout()
plt.show()

### Contribution of Universities in DEI Conversation

In [None]:
# Universities included in the contribution analysis, in display order:
# a first group of non-HBCU universities, the five HBCUs, then three more universities.
top_and_hbcu = (
    [
        "New York University",
        "University of California, Davis",
        "University of Michigan--Ann Arbor",
        "American University",
        "Johns Hopkins University",
        "The Pennsylvania State University",
        "Stanford University",
    ]
    + [
        "Florida A&M University",
        "North Carolina Agricultural & Technical State University",
        "Howard University",
        "Spelman College",
        "Morgan State University",
    ]
    + [
        "Georgetown University",
        "Simmons University",
        "Georgia State University",
    ]
)

In [None]:
# Keep only the rows whose university is in the selected list.
is_selected_university = predicted_df['University'].isin(top_and_hbcu)
filtered_df = predicted_df[is_selected_university]

In [None]:
# Number of predicted-DEI tweets per selected university
# (summing Predicted_DEI counts the rows flagged 1).
dei_counts = (
    filtered_df.groupby('University')['Predicted_DEI']
    .sum()
    .rename('DEI_Tweet_Count')
    .reset_index()
)

In [None]:
# Each university's share of ALL predicted-DEI tweets (not just the selected universities).
total_dei_tweets = predicted_df['Predicted_DEI'].sum()
dei_counts['Contribution (%)'] = dei_counts['DEI_Tweet_Count'].div(total_dei_tweets).mul(100)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Share of the overall predicted-DEI conversation contributed by two groups of
# universities, drawn as two side-by-side horizontal bar charts.
top_dei_unis = [
    "New York University",
    "University of California, Davis",
    "University of Michigan--Ann Arbor",
    "American University",
    "Johns Hopkins University",
    "The Pennsylvania State University",
    "Stanford University",
    "Georgetown University",
    "Simmons University",
    "Georgia State University",
]

hbcu_list = [
    "Florida A&M University",
    "North Carolina Agricultural & Technical State University",
    "Howard University",
    "Spelman College",
    "Morgan State University",
]

# Denominator for both groups: every predicted-DEI tweet in the dataset.
total_dei_tweets = predicted_df['Predicted_DEI'].sum()

# Top DEI universities: per-university DEI count, share of the total, largest first.
filtered_dei = predicted_df[predicted_df['University'].isin(top_dei_unis)]
dei_counts_dei_unis = (
    filtered_dei.groupby('University')['Predicted_DEI']
    .sum()
    .reset_index()
    .assign(**{'Contribution (%)': lambda counts: (counts['Predicted_DEI'] / total_dei_tweets) * 100})
    .sort_values(by='Contribution (%)', ascending=False)
)

# HBCUs: same computation on the HBCU rows.
filtered_hbcus = predicted_df[predicted_df['University'].isin(hbcu_list)]
dei_counts_hbcus = (
    filtered_hbcus.groupby('University')['Predicted_DEI']
    .sum()
    .reset_index()
    .assign(**{'Contribution (%)': lambda counts: (counts['Predicted_DEI'] / total_dei_tweets) * 100})
    .sort_values(by='Contribution (%)', ascending=False)
)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# (axes, data, title) for each panel, left to right.
contribution_panels = [
    (axes[0], dei_counts_dei_unis, 'Contribution of Top DEI Universities to the Overall DEI Conversation'),
    (axes[1], dei_counts_hbcus, 'Contribution of HBCUs to the Overall DEI Conversation'),
]

for panel_ax, contributions, panel_title in contribution_panels:
    sns.barplot(data=contributions, x='Contribution (%)', y='University', ax=panel_ax, palette='viridis')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Contribution (%)')
    panel_ax.set_ylabel('University')

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import plotly.express as px

# Yearly share of the overall predicted-DEI conversation for the top 5 DEI
# universities and for the HBCUs, drawn as two plotly line charts.

# Plot from a copy whose 'Year' is a string, so plotly treats years as categories.
# predicted_df itself is left untouched: overwriting its 'Year' column with strings
# makes the integer filters used earlier in the notebook (e.g. Year == 2019)
# silently return empty frames when those cells are re-run.
predicted_by_year = predicted_df.assign(Year=predicted_df['Year'].astype(str))

# Top 5 DEI universities
top_dei_unis = [
    "New York University",
    "University of California, Davis",
    "University of Michigan--Ann Arbor",
    "American University",
    "Johns Hopkins University"
]

# HBCU list
hbcu_list = [
    "Florida A&M University",
    "North Carolina Agricultural & Technical State University",
    "Howard University",
    "Spelman College",
    "Morgan State University"
]

# Denominator: every predicted-DEI tweet across all years and universities.
total_dei_tweets = predicted_by_year['Predicted_DEI'].sum()

# Calculate DEI contributions for top DEI universities by year
filtered_dei = predicted_by_year[predicted_by_year['University'].isin(top_dei_unis)]
dei_counts_dei_unis_yearly = filtered_dei.groupby(['Year', 'University'])['Predicted_DEI'].sum().reset_index()
dei_counts_dei_unis_yearly['Contribution (%)'] = (dei_counts_dei_unis_yearly['Predicted_DEI'] / total_dei_tweets) * 100

# Calculate DEI contributions for HBCUs by year
filtered_hbcus = predicted_by_year[predicted_by_year['University'].isin(hbcu_list)]
dei_counts_hbcus_yearly = filtered_hbcus.groupby(['Year', 'University'])['Predicted_DEI'].sum().reset_index()
dei_counts_hbcus_yearly['Contribution (%)'] = (dei_counts_hbcus_yearly['Predicted_DEI'] / total_dei_tweets) * 100

# Create a line chart for top DEI universities with markers
fig_dei_unis = px.line(dei_counts_dei_unis_yearly,
                        x='Year',
                        y='Contribution (%)',
                        color='University',
                        markers=True,  # Add markers to the line chart
                        title='Contribution of Top 5 DEI Universities to the Overall DEI Conversation by Year',
                        labels={'Contribution (%)': 'Contribution (%)', 'Year': 'Year'})

# Create a line chart for HBCUs with markers
fig_hbcu = px.line(dei_counts_hbcus_yearly,
                    x='Year',
                    y='Contribution (%)',
                    color='University',
                    markers=True,  # Add markers to the line chart
                    title='Contribution of HBCUs to the Overall DEI Conversation by Year',
                    labels={'Contribution (%)': 'Contribution (%)', 'Year': 'Year'})

# Show both figures
fig_dei_unis.show()
fig_hbcu.show()
