In [None]:

import pandas as pd
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib
matplotlib.use('TkAgg')
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import matplotlib.pyplot as plt

# -----------------------
# Utility / pipeline functions (simplified)
# -----------------------

def safe_read_csv(path):
    """Load a CSV file into a DataFrame, reporting failures in a dialog.

    Returns the parsed DataFrame, or None when ``pd.read_csv`` raised; in
    that case a tkinter error box showing the exception text is displayed.
    """
    frame = None
    try:
        frame = pd.read_csv(path)
    except Exception as e:
        # Any read/parse problem is surfaced to the user instead of raised.
        messagebox.showerror('File error', f'Could not read CSV:\n{e}')
    return frame


def preprocess_data(interns_df, companies_df):
    """Clean both tables and derive text/numeric features from them.

    Works on copies of the inputs. Missing text/location/stipend values are
    filled, skill and requirement strings are normalised and TF-IDF encoded
    with a vocabulary fitted on both tables together, and a standardised
    ``*_scaled`` column is added for each numeric source column (set to 0
    when the source column is absent).

    Returns ``(interns, companies, intern_skills_tfidf, company_req_tfidf)``.
    """
    interns = interns_df.copy()
    interns = interns.fillna({'skills': '', 'interest': '', 'location': 'Unknown'})
    companies = companies_df.copy()
    companies = companies.fillna(
        {'requirements': '', 'domain': '', 'location': 'Unknown', 'stipend': 0}
    )

    # Commas and slashes separate skills in the raw text; turn them into spaces.
    separators = str.maketrans({',': ' ', '/': ' '})

    def normalise(raw):
        if pd.isna(raw):
            return ''
        return str(raw).lower().strip().translate(separators)

    interns['skills_processed'] = interns['skills'].apply(normalise)
    companies['requirements_processed'] = companies['requirements'].apply(normalise)
    interns['interest'] = interns['interest'].astype(str).str.lower()
    companies['domain'] = companies['domain'].astype(str).str.lower()

    # One shared vocabulary so intern and company vectors are comparable.
    corpus = pd.concat([interns['skills_processed'], companies['requirements_processed']])
    vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
    vectorizer.fit(corpus)
    intern_skills_tfidf = vectorizer.transform(interns['skills_processed'])
    company_req_tfidf = vectorizer.transform(companies['requirements_processed'])

    def standardised(frame, column):
        # Each column gets its own freshly fitted scaler; 0 when it is missing.
        if column not in frame.columns:
            return 0
        return StandardScaler().fit_transform(frame[[column]])

    scaling_plan = (
        (interns, 'gpa_scaled', 'gpa'),
        (companies, 'stipend_scaled', 'stipend'),
        (companies, 'min_gpa_scaled', 'min_gpa'),
        (interns, 'age_scaled', 'age'),
        (interns, 'experience_scaled', 'prior_experience_years'),
        (interns, 'duration_scaled', 'preferred_duration_months'),
    )
    for frame, target, source in scaling_plan:
        frame[target] = standardised(frame, source)

    return interns, companies, intern_skills_tfidf, company_req_tfidf


def create_feature_matrix(interns, companies, intern_skills_tfidf, company_req_tfidf):
    """Build one feature row for every (intern, company) pair.

    Rows are ordered intern-major: the row for intern ``i`` and company ``j``
    sits at position ``i * len(companies) + j``.

    Parameters:
        interns, companies: preprocessed tables. ``interest``/``location``
            (interns) and ``domain``/``location`` (companies) are required;
            ``gpa``/``min_gpa`` and the ``*_scaled`` columns are optional and
            contribute 0 when absent.
        intern_skills_tfidf, company_req_tfidf: TF-IDF matrices whose rows
            line up with the rows of ``interns`` and ``companies``.

    Returns:
        ``(fm, X)`` where ``fm`` holds ``intern_idx``, ``company_idx`` and the
        seven feature columns, and ``X`` is just the seven feature columns.
        Both are empty (but correctly labelled) when either table is empty.
    """
    feature_cols = ['skills_similarity', 'gpa_score', 'interest_score', 'location_score',
                    'age_score', 'experience_score', 'duration_score']
    n_interns, n_companies = len(interns), len(companies)

    if n_interns == 0 or n_companies == 0:
        # No pairs to score: hand back empty frames that still have the schema.
        fm = pd.DataFrame(columns=['intern_idx', 'company_idx'] + feature_cols)
        return fm, fm[feature_cols]

    # One call for the whole similarity matrix instead of one call per pair.
    skills_sim = np.asarray(cosine_similarity(intern_skills_tfidf, company_req_tfidf), dtype=float)

    # GPA headroom: share of the gap between the company minimum and 10.0 that
    # the intern covers, capped at 1. Pairs below the minimum, with missing or
    # non-numeric values, or with a zero-width gap (min_gpa == 10) score 0.
    gpa_score = np.zeros((n_interns, n_companies))
    if 'gpa' in interns.columns and 'min_gpa' in companies.columns:
        gpa = pd.to_numeric(interns['gpa'], errors='coerce').to_numpy(dtype=float)[:, None]
        min_gpa = pd.to_numeric(companies['min_gpa'], errors='coerce').to_numpy(dtype=float)[None, :]
        headroom = 10.0 - min_gpa
        with np.errstate(divide='ignore', invalid='ignore'):
            eligible = (gpa >= min_gpa) & (headroom != 0)
            ratio = (gpa - min_gpa) / headroom
        gpa_score = np.where(eligible, np.minimum(ratio, 1.0), 0.0)

    def interest_match(interest, domain):
        # Exact match scores 1, substring containment either way scores 0.5.
        # NOTE(review): an empty string is contained in every string, so a
        # blank interest or domain yields 0.5 here — confirm that is intended.
        if interest == domain:
            return 1.0
        if interest in domain or domain in interest:
            return 0.5
        return 0.0

    interests = interns['interest'].tolist()
    domains = companies['domain'].tolist()
    interest_score = np.array(
        [[interest_match(interest, domain) for domain in domains] for interest in interests],
        dtype=float,
    )

    intern_locations = interns['location'].tolist()
    company_locations = companies['location'].tolist()
    location_score = np.array(
        [[1.0 if here == there else 0.0 for there in company_locations]
         for here in intern_locations],
        dtype=float,
    )

    def intern_column(name):
        # Optional per-intern column; all zeros when it does not exist.
        if name not in interns.columns:
            return np.zeros(n_interns)
        return interns[name].to_numpy(dtype=float)

    age_score = np.minimum(intern_column('age_scaled') / 25, 1.0)
    experience_score = intern_column('experience_scaled')
    duration_score = intern_column('duration_scaled')

    fm = pd.DataFrame({
        'intern_idx': np.repeat(np.arange(n_interns), n_companies),
        'company_idx': np.tile(np.arange(n_companies), n_interns),
        'skills_similarity': skills_sim.ravel(),
        'gpa_score': gpa_score.ravel(),
        'interest_score': interest_score.ravel(),
        'location_score': location_score.ravel(),
        # Per-intern values are repeated once for each company.
        'age_score': np.repeat(age_score, n_companies),
        'experience_score': np.repeat(experience_score, n_companies),
        'duration_score': np.repeat(duration_score, n_companies),
    })
    X = fm[feature_cols]
    return fm, X


def synthesize_target(X):
    """Construct a synthetic match score to train the regressors on.

    The target is a fixed weighted sum of the seven feature columns of ``X``
    (weights 40/20/15/10/5/5/5) plus Gaussian noise with standard deviation 5.
    The noise comes from a private generator seeded with 42, so the result is
    reproducible and NumPy's global random state is left untouched.

    Returns a Series aligned with ``X``.
    """
    # A local RandomState(42) draws the same values as seeding the global
    # generator with 42, without resetting randomness for the whole process.
    rng = np.random.RandomState(42)
    noise = rng.normal(0, 5, len(X))
    y = (X['skills_similarity'] * 40 + X['gpa_score'] * 20 + X['interest_score'] * 15 +
         X['location_score'] * 10 + X['age_score'] * 5 + X['experience_score'] * 5 +
         X['duration_score'] * 5 + noise)
    return y


def train_and_select_model(X, y):
    """Cross-validate the candidate regressors and pick the lowest-RMSE one.

    Features are standardised, each model is scored with shuffled K-fold
    cross-validation (5 folds, or fewer when there are fewer than 5 samples)
    and then refitted on all of the standardised data.

    Parameters:
        X: DataFrame of features.
        y: Series of targets, positionally aligned with ``X``.

    Returns:
        ``(best, results_df)`` where ``best`` is a dict with keys ``name``,
        ``model``, ``rmse``, ``mae`` and ``r2`` for the winning model and
        ``results_df`` holds the same fields for every candidate.

    Raises:
        ValueError: if fewer than 2 samples are supplied.
    """
    n_samples = len(X)
    if n_samples < 2:
        raise ValueError(
            f'Need at least 2 samples to cross-validate, got {n_samples}.'
        )

    models = {
        'Ridge Regression': Ridge(alpha=1.0),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42),
        'Support Vector Regressor': SVR(kernel='rbf', C=1.0, epsilon=0.1)
    }

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    # KFold rejects n_splits larger than the sample count, so cap it for tiny
    # datasets. With single-sample test folds R^2 is undefined (NaN); the
    # selection below relies on RMSE only.
    cv = KFold(n_splits=min(5, n_samples), shuffle=True, random_state=42)

    results = []
    for name, model in models.items():
        rmses, maes, r2s = [], [], []
        for train_idx, test_idx in cv.split(X_scaled):
            model.fit(X_scaled.iloc[train_idx], y.iloc[train_idx])
            y_pred = model.predict(X_scaled.iloc[test_idx])
            y_true = y.iloc[test_idx]
            rmses.append(np.sqrt(mean_squared_error(y_true, y_pred)))
            maes.append(mean_absolute_error(y_true, y_pred))
            r2s.append(r2_score(y_true, y_pred))
        # The estimator handed back to the caller is trained on every row.
        model.fit(X_scaled, y)
        results.append({'name': name, 'model': model, 'rmse': np.mean(rmses), 'mae': np.mean(maes), 'r2': np.mean(r2s)})

    results_df = pd.DataFrame(results)
    # results_df has a default RangeIndex, so its labels index `results` too.
    best_idx = results_df['rmse'].idxmin()
    best = results[best_idx]
    return best, results_df


def allocate_internships(predicted_scores, interns, companies, weight_match=0.7, weight_stipend=0.3):
    """Greedily assign each intern to at most one company.

    Every (intern, company) pair gets a combined score
    ``weight_match * predicted_score + weight_stipend * stipend / max_stipend``.
    Pairs are visited from the highest combined score down (ties keep
    intern-major order) and a pair is accepted when the intern is still
    unassigned and the company still has a free slot.

    Parameters:
        predicted_scores: 2-D array-like, one row per intern and one column
            per company.
        interns: DataFrame; ``intern_id`` and ``name`` are used when present.
        companies: DataFrame; ``company_name``, ``stipend`` and ``slots`` are
            used when present. A missing ``slots`` column means one slot per
            company; a missing or non-numeric stipend counts as 0.
        weight_match, weight_stipend: weights of the two score components.

    Returns:
        DataFrame with columns ``intern_id``, ``name``, ``company_name``,
        ``match_score``, ``stipend`` and ``combined_score`` (no rows when
        nothing could be allocated).
    """
    columns = ['intern_id', 'name', 'company_name', 'match_score', 'stipend', 'combined_score']
    scores = np.asarray(predicted_scores, dtype=float)
    n_interns, n_companies = scores.shape

    if 'slots' in companies.columns:
        company_slots = companies['slots'].tolist()
    else:
        company_slots = [1] * len(companies)

    # The stipend column is optional, matching the `.get(..., default)` style
    # used for the other optional columns below.
    if 'stipend' in companies.columns:
        stipend = pd.to_numeric(companies['stipend'], errors='coerce').fillna(0).to_numpy(dtype=float)
    else:
        stipend = np.zeros(len(companies))
    top_stipend = stipend.max() if stipend.size else 0.0
    stipend_norm = stipend / (top_stipend if top_stipend > 0 else 1)
    combined_scores = weight_match * scores + weight_stipend * stipend_norm

    # Stable sort on the negated scores = descending order with ties left in
    # their original (intern-major) order.
    ranking = np.argsort(-combined_scores, axis=None, kind='stable')

    allocations = []
    allocated_interns = set()
    open_companies = sum(1 for slots in company_slots if slots > 0)

    for flat_idx in ranking:
        # Nothing further can be assigned once either side is exhausted.
        if len(allocated_interns) == n_interns or open_companies == 0:
            break
        intern_idx, company_idx = divmod(int(flat_idx), n_companies)
        if intern_idx in allocated_interns or not company_slots[company_idx] > 0:
            continue

        intern_row = interns.iloc[intern_idx]
        company_row = companies.iloc[company_idx]
        allocations.append({
            'intern_id': intern_row.get('intern_id', intern_idx),
            'name': intern_row.get('name', f'Intern {intern_idx}'),
            'company_name': company_row.get('company_name', f'Company {company_idx}'),
            'match_score': float(scores[intern_idx, company_idx]),
            'stipend': float(stipend[company_idx]),
            'combined_score': float(combined_scores[intern_idx, company_idx])
        })
        allocated_interns.add(intern_idx)
        company_slots[company_idx] -= 1
        if not company_slots[company_idx] > 0:
            open_companies -= 1

    return pd.DataFrame(allocations, columns=columns)

# -----------------------
# GUI
# -----------------------

class InternshipApp(tk.Tk):
    """Main window that drives the internship-allocation pipeline.

    The buttons on the left load the two CSV files, show summaries and plots,
    build the pairwise feature matrix, train the candidate regressors and run
    the greedy allocation. Output appears in the notebook tabs on the right.
    """

    def __init__(self):
        """Create the window, initialise empty pipeline state, build widgets."""
        super().__init__()
        self.title('Internship Allocation - Simple UI')
        self.geometry('1100x700')

        # Raw tables exactly as loaded from disk (used by EDA and plots).
        self.interns_df = None
        self.companies_df = None
        # Preprocessed copies produced by build_features; kept separate so the
        # raw tables are never overwritten with derived columns.
        self._interns_processed = None
        self._companies_processed = None
        self.intern_skills_tfidf = None
        self.company_req_tfidf = None
        self.feature_matrix = None
        self.X = None
        self.y = None
        self.best_model = None
        self.results_df = None
        self.allocations = None
        # Figure currently embedded in the Plots tab, so it can be closed.
        self._plot_figure = None

        # Layout
        self.create_widgets()

    def create_widgets(self):
        """Build the control column on the left and the output tabs on the right."""
        # Left control frame
        control = ttk.Frame(self)
        control.pack(side='left', fill='y', padx=8, pady=8)

        # None entries become separators between groups of buttons.
        controls = [
            ('Load Interns CSV', self.load_interns),
            ('Load Companies CSV', self.load_companies),
            None,
            ('Run EDA', self.run_eda),
            ('Show Plots', self.show_plots),
            None,
            ('Preprocess & Build Features', self.build_features),
            ('Train Models & Select Best', self.train_models),
            ('Run Allocation', self.run_allocation),
            None,
            ('Export Allocations', self.export_allocations),
        ]
        for entry in controls:
            if entry is None:
                ttk.Separator(control).pack(fill='x', pady=6)
            else:
                label, callback = entry
                ttk.Button(control, text=label, command=callback).pack(fill='x', pady=4)

        self.status_var = tk.StringVar(value='Status: waiting for input')
        ttk.Label(control, textvariable=self.status_var, wraplength=200).pack(fill='x', pady=10)

        # Right notebook for outputs
        self.nb = ttk.Notebook(self)
        self.nb.pack(side='right', expand=True, fill='both')

        # EDA tab
        self.eda_frame = ttk.Frame(self.nb)
        self.nb.add(self.eda_frame, text='EDA / Summary')
        self.eda_text = tk.Text(self.eda_frame)
        self.eda_text.pack(expand=True, fill='both')

        # Plot tab
        self.plot_frame = ttk.Frame(self.nb)
        self.nb.add(self.plot_frame, text='Plots')

        # Allocations tab
        self.alloc_frame = ttk.Frame(self.nb)
        self.nb.add(self.alloc_frame, text='Allocations')
        self.tree = ttk.Treeview(self.alloc_frame, columns=('intern_id','name','company','match','stipend','combined'), show='headings')
        for c in self.tree['columns']:
            self.tree.heading(c, text=c)
        self.tree.pack(expand=True, fill='both')

    # ----------------------- internal helpers
    def _invalidate_pipeline(self):
        """Drop features and model derived from previously loaded data."""
        self._interns_processed = None
        self._companies_processed = None
        self.intern_skills_tfidf = None
        self.company_req_tfidf = None
        self.feature_matrix = None
        self.X = None
        self.y = None
        self.best_model = None
        self.results_df = None

    def _report_failure(self, title, exc):
        """Show a pipeline error to the user instead of losing it on stderr."""
        messagebox.showerror(title, f'{exc}')
        self.status_var.set(f'{title}: {exc}')

    # ----------------------- GUI callbacks
    def load_interns(self):
        """Ask for the interns CSV and store it as the raw interns table."""
        path = filedialog.askopenfilename(title='Select interns CSV', filetypes=[('CSV files','*.csv'),('All files','*.*')])
        if not path:
            return
        df = safe_read_csv(path)
        if df is None:
            return
        self.interns_df = df
        # Features/model built from the previous file no longer apply.
        self._invalidate_pipeline()
        self.status_var.set(f'Loaded interns: {len(df)} rows')

    def load_companies(self):
        """Ask for the companies CSV and store it as the raw companies table."""
        path = filedialog.askopenfilename(title='Select companies CSV', filetypes=[('CSV files','*.csv'),('All files','*.*')])
        if not path:
            return
        df = safe_read_csv(path)
        if df is None:
            return
        self.companies_df = df
        # Features/model built from the previous file no longer apply.
        self._invalidate_pipeline()
        self.status_var.set(f'Loaded companies: {len(df)} rows')

    def run_eda(self):
        """Write schema info, descriptive statistics and insights to the EDA tab."""
        import io  # local import: only needed to capture DataFrame.info() output

        if self.interns_df is None or self.companies_df is None:
            messagebox.showwarning('Missing data', 'Please load both interns and companies CSVs first.')
            return
        interns = self.interns_df
        companies = self.companies_df
        self.eda_text.delete('1.0', 'end')
        self.eda_text.insert('end', '--- Interns Info ---\n')
        # DataFrame.info() prints and returns None, so capture it in a buffer.
        info_buffer = io.StringIO()
        interns.info(buf=info_buffer)
        self.eda_text.insert('end', f'{info_buffer.getvalue()}\n')
        self.eda_text.insert('end', '\nDescriptive stats for interns:\n')
        self.eda_text.insert('end', interns.describe(include='all').to_string())
        self.eda_text.insert('end', '\n\n--- Companies Info ---\n')
        self.eda_text.insert('end', companies.describe(include='all').to_string())
        # quick insights (best effort: they need specific columns to exist)
        try:
            avg_gpa = interns['gpa'].mean()
            common_interest = interns['interest'].mode()[0]
            common_skill = interns['skills'].str.split(', ').explode().mode()[0]
            total_slots = companies['slots'].sum()
            avg_stipend = companies['stipend'].mean()
            common_domain = companies['domain'].mode()[0]
            insights = f"\n\nBusiness Insights:\n- Average GPA: {avg_gpa:.2f}\n- Most common interest: {common_interest}\n- Most common skill: {common_skill}\n- Total slots: {int(total_slots)}\n- Average stipend: {avg_stipend:.2f}\n- Most common domain: {common_domain}\n"
        except (KeyError, IndexError, AttributeError, TypeError, ValueError) as exc:
            # Tell the user why the section is missing rather than hiding it.
            insights = f"\n\nBusiness Insights unavailable: {exc}\n"
        self.eda_text.insert('end', insights)
        self.status_var.set('EDA complete.')

    def show_plots(self):
        """Render a 2x2 grid of overview charts in the Plots tab."""
        if self.interns_df is None or self.companies_df is None:
            messagebox.showwarning('Missing data', 'Please load both interns and companies CSVs first.')
            return
        for w in self.plot_frame.winfo_children():
            w.destroy()
        # pyplot keeps every figure alive until closed; release the old one.
        if self._plot_figure is not None:
            plt.close(self._plot_figure)
        fig, axes = plt.subplots(2, 2, figsize=(8,6))
        self._plot_figure = fig
        interns = self.interns_df
        companies = self.companies_df

        def draw(ax, painter, placeholder):
            # Each panel is best effort: a missing or unusable column is
            # replaced by a short note instead of aborting the other panels.
            try:
                painter(ax)
            except Exception:
                ax.text(0.5, 0.5, placeholder, ha='center')

        def gpa_hist(ax):
            ax.hist(interns['gpa'].dropna(), bins=10)
            ax.set_title('GPA Distribution')

        def top_interests(ax):
            interns['interest'].value_counts().nlargest(6).plot(kind='bar', ax=ax)
            ax.set_title('Top Interests')

        def top_domains(ax):
            companies['domain'].value_counts().nlargest(6).plot(kind='bar', ax=ax)
            ax.set_title('Top Company Domains')

        def gpa_vs_experience(ax):
            ax.scatter(interns['gpa'], interns['prior_experience_years'])
            ax.set_title('GPA vs Experience')

        draw(axes[0,0], gpa_hist, 'No GPA data')
        draw(axes[0,1], top_interests, 'No interest data')
        draw(axes[1,0], top_domains, 'No domain data')
        draw(axes[1,1], gpa_vs_experience, 'No data')

        fig.tight_layout()
        canvas = FigureCanvasTkAgg(fig, master=self.plot_frame)
        canvas.get_tk_widget().pack(expand=True, fill='both')
        canvas.draw()
        self.status_var.set('Plots displayed.')

    def build_features(self):
        """Preprocess the raw tables and build the pairwise feature matrix."""
        if self.interns_df is None or self.companies_df is None:
            messagebox.showwarning('Missing data', 'Please load both interns and companies CSVs first.')
            return
        try:
            interns, companies, intern_tfidf, company_tfidf = preprocess_data(self.interns_df, self.companies_df)
            fm, X = create_feature_matrix(interns, companies, intern_tfidf, company_tfidf)
        except Exception as exc:  # GUI boundary: report, do not crash the callback
            self._report_failure('Feature building failed', exc)
            return
        # A model trained on an earlier feature matrix is stale now.
        self._invalidate_pipeline()
        self._interns_processed = interns
        self._companies_processed = companies
        self.intern_skills_tfidf = intern_tfidf
        self.company_req_tfidf = company_tfidf
        self.feature_matrix = fm
        self.X = X
        self.status_var.set(f'Feature matrix built: {len(X)} rows')

    def train_models(self):
        """Train the candidate models on a synthetic target and keep the best."""
        if self.X is None:
            messagebox.showwarning('Missing features', 'Please run Preprocess & Build Features first.')
            return
        try:
            y = synthesize_target(self.X)
            best, results_df = train_and_select_model(self.X, y)
        except Exception as exc:  # GUI boundary: report, do not crash the callback
            self._report_failure('Training failed', exc)
            return
        self.y = y
        self.best_model = best['model']
        self.results_df = results_df
        self.status_var.set(f"Trained models. Best: {best['name']} (RMSE {best['rmse']:.3f})")
        # show results in EDA tab
        self.eda_text.insert('end', '\n\nModel cross-validation results:\n')
        self.eda_text.insert('end', results_df.to_string())

    def run_allocation(self):
        """Score every pair with the best model and show the greedy allocation."""
        if (self.best_model is None or self.feature_matrix is None or self.X is None
                or self._interns_processed is None or self._companies_processed is None):
            messagebox.showwarning('Missing step', 'Please build features and train models first.')
            return
        interns = self._interns_processed
        companies = self._companies_processed
        n_interns = len(interns)
        n_companies = len(companies)
        try:
            # Predict on the very feature rows the model was trained on, so
            # training and scoring can never disagree about feature rules.
            # Re-fitting a StandardScaler on the same X reproduces the scaling
            # used in training; column names are kept to match the fitted model.
            scaler = StandardScaler()
            X_scaled = pd.DataFrame(scaler.fit_transform(self.X), columns=self.X.columns)
            flat_preds = self.best_model.predict(X_scaled)
            # Place each prediction by its recorded pair indices.
            preds = np.zeros((n_interns, n_companies))
            rows = self.feature_matrix['intern_idx'].to_numpy(dtype=int)
            cols = self.feature_matrix['company_idx'].to_numpy(dtype=int)
            preds[rows, cols] = flat_preds

            allocations_df = allocate_internships(preds, interns, companies)
        except Exception as exc:  # GUI boundary: report, do not crash the callback
            self._report_failure('Allocation failed', exc)
            return
        self.allocations = allocations_df

        # populate treeview
        for i in self.tree.get_children():
            self.tree.delete(i)
        if not allocations_df.empty:
            for _, row in allocations_df.iterrows():
                self.tree.insert('', 'end', values=(row['intern_id'], row['name'], row['company_name'], f"{row['match_score']:.3f}", int(row['stipend']), f"{row['combined_score']:.3f}"))
            self.status_var.set(f'Allocation complete: {len(allocations_df)} interns allocated')
        else:
            messagebox.showinfo('No allocations', 'No allocations produced.')
            self.status_var.set('Allocation produced 0 results')

    def export_allocations(self):
        """Save the latest allocation table to a CSV file chosen by the user."""
        if self.allocations is None or self.allocations.empty:
            messagebox.showwarning('No data', 'No allocations to export. Run allocation first.')
            return
        path = filedialog.asksaveasfilename(defaultextension='.csv', filetypes=[('CSV files','*.csv')])
        if not path:
            return
        try:
            self.allocations.to_csv(path, index=False)
        except OSError as exc:
            # e.g. read-only location or file locked by another program
            self._report_failure('Export failed', exc)
            return
        messagebox.showinfo('Exported', f'Allocations exported to {path}')

# -----------------------
# Run app
# -----------------------

# Start the GUI only when this file is executed directly, not when imported.
if __name__ == '__main__':
    app = InternshipApp()
    # Blocks in Tk's event loop until the window is closed.
    app.mainloop()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   intern_id                  1000 non-null   int64  
 1   name                       1000 non-null   object 
 2   age                        1000 non-null   int64  
 3   gpa                        1000 non-null   float64
 4   skills                     1000 non-null   object 
 5   interest                   1000 non-null   object 
 6   location                   1000 non-null   object 
 7   preferred_duration_months  1000 non-null   int64  
 8   prior_experience_years     1000 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 70.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  ----- 