In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

def _as_text_list(column):
    """Return the values of *column* as a list of str, with missing cells as ''."""
    return ['' if pd.isna(value) else str(value) for value in column]


def predict_team_success(current_teams_path, past_finalists_path,
                         output_path='sih_2024_predictions.xlsx'):
    """Score current teams with a heuristic "success probability".

    Both CSV files must contain the columns ``COLLEGE`` and ``ORGANISATION``.
    The score blends two signals:

    * 70%: probability from a random forest trained on TF-IDF features of
      college names (current teams labelled 0, past finalists labelled 1);
    * 30%: share of past finalists that belong to the team's organisation
      (0.1 when the organisation has no past finalists).

    The blend is scaled by ``2 * finalists / (finalists + current teams)``
    and capped at 100.

    Args:
        current_teams_path: Path to the CSV of teams to score.
        past_finalists_path: Path to the CSV of past finalists.
        output_path: Where the Excel report is written. Defaults to the
            file name that was previously hard-coded.

    Returns:
        A copy of the current-teams table sorted by ``COLLEGE`` with an added
        ``Success_Probability`` column formatted as a string such as
        ``"4.09%"``.

    Side effects:
        Writes the result to ``output_path`` and prints the table plus a
        short summary to stdout.
    """
    current_teams = pd.read_csv(current_teams_path)
    past_finalists = pd.read_csv(past_finalists_path)

    # Blank cells are read as NaN, which the vectorizer rejects; treat them
    # as empty documents instead of failing.
    current_colleges = _as_text_list(current_teams['COLLEGE'])
    finalist_colleges = _as_text_list(past_finalists['COLLEGE'])

    # Training set: current teams are the negative class, finalists positive.
    # NOTE(review): the model is later scored on the same current-team rows it
    # was trained on as negatives, which biases their probabilities downwards.
    all_labels = np.concatenate([
        np.zeros(len(current_colleges), dtype=int),
        np.ones(len(finalist_colleges), dtype=int),
    ])

    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    all_tfidf = vectorizer.fit_transform(current_colleges + finalist_colleges)

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(all_tfidf, all_labels)

    current_tfidf = vectorizer.transform(current_colleges)
    class_probs = rf_model.predict_proba(current_tfidf)

    # predict_proba columns follow rf_model.classes_; look the finalist class
    # up instead of assuming it is column 1 (it is absent when the training
    # data contained no finalists).
    finalist_columns = np.flatnonzero(rf_model.classes_ == 1)
    if finalist_columns.size:
        college_based_probs = class_probs[:, finalist_columns[0]]
    else:
        college_based_probs = np.zeros(len(current_teams))

    # Share of past finalists per organisation, computed in a single pass.
    finalist_counts = past_finalists['ORGANISATION'].value_counts()
    org_success = finalist_counts[finalist_counts > 0] / len(past_finalists)
    org_probs = (
        current_teams['ORGANISATION']
        .map(org_success)
        .fillna(0.1)  # organisations without any past finalist
        .to_numpy(dtype=float)
    )

    # Loop-invariant scaling factor; the factor 2 is kept from the original
    # heuristic.
    total_rows = len(past_finalists) + len(current_teams)
    historical_rate = len(past_finalists) / total_rows if total_rows else 0.0

    # Everything is combined positionally, so the result does not depend on
    # the DataFrame's index labels.
    final_predictions = (0.7 * college_based_probs + 0.3 * org_probs) * 100
    final_predictions = final_predictions * (historical_rate * 2)
    final_predictions = np.minimum(final_predictions, 100)  # Cap at 100%

    result_df = current_teams.copy()
    result_df['Success_Probability'] = final_predictions
    result_df = result_df.sort_values('COLLEGE')
    result_df['Success_Probability'] = result_df['Success_Probability'].apply(
        lambda x: f"{x:.2f}%"
    )

    result_df.to_excel(output_path, index=False)

    print("\nPredicted Success Probabilities for SIH 2024 Teams:")
    print(result_df.to_string(index=False))
    print("\nAnalysis Summary:")
    print(f"Number of teams analyzed: {len(result_df)}")
    print(f"Number of unique organizations: {result_df['ORGANISATION'].nunique()}")
    print(f"Number of unique colleges: {result_df['COLLEGE'].nunique()}")

    return result_df

# Example run: score the current teams against the past finalists.
predictions = predict_team_success(
    current_teams_path='0_modified_combined_final.csv',
    past_finalists_path='0_modified_finalists_final.csv',
)


Predicted Success Probabilities for SIH 2024 Teams:
                                                           ORGANISATION                               TEAM NAME                           TEAM LEADER NAME                                                                                                                                                                     COLLEGE Success_Probability
                                                  Government of Gujarat                                 SAARTHI                              Divya Kaurani                                                                                                                    030 L.D.R.P. INSTITUTE OF TECH. & RESEARCH., GANDHINAGAR               4.09%
                        National Technical Research Organisation (NTRO)                         CODING BRIGADES                                 Oum Gadani                                                                                                   