In [10]:
import pandas as pd

# Load the dataset
# ... existing code ...
data = pd.read_csv('Exploring Your Future_ Major Selection Survey(Sheet1).csv', encoding='latin-1')
# ... existing code ...

In [11]:
data.head()
# Explore the dataset
print(data.info())
print(data.describe())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 28 columns):
 #   Column                                                                                                                                 Non-Null Count  Dtype  
---  ------                                                                                                                                 --------------  -----  
 0   Id                                                                                                                                     125 non-null    int64  
 1   Start time                                                                                                                             125 non-null    object 
 2   Completion time                                                                                                                        125 non-null    object 
 3   Email                                                                 

In [12]:
# Check for missing values
print(data.isnull().sum())

# Fill missing values with "Unknown" for categorical columns
data.fillna("Unknown", inplace=True)

Id                                                                                                                                         0
Start time                                                                                                                                 0
Completion time                                                                                                                            0
Email                                                                                                                                      0
Name                                                                                                                                     125
Which high school curriculum did you follow?\n                                                                                             0
If you studied the UAE Curriculum, which track did you follow?\n                                                                           0
Which subject

In [24]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# ... existing code ...

# Print the actual column names
print(data.columns.tolist())

# Update the label_encode_columns list with the exact column names
label_encode_columns = [
    'Which high school curriculum did you follow?',
    'If you studied the UAE Curriculum, which track did you follow?',
    'Which subjects did you study in high school? (Select all that apply)',
    'What was your IELTS/TOEFL score?',
    'Which field interests you the most?',
    'What type of career do you envision?',
    'Do you prefer a technical, creative, or business-oriented role?',
    'Would you like to work in a research, corporate, startup, or freelance environment?',
    'How important is job stability in your career choice?',
    'Do you prefer working with people or working with technology?',
    'Do you enjoy working on structured tasks with clear guidelines?',
    'Do you enjoy hands-on work (e.g., lab experiments, building things)?',
    'Do you prefer theoretical learning, practical work, or a mix of both?',
    'Would you rather work on individual projects or team-based assignments?',
    'Do you prefer exams, projects, or research-based assessments?',
    'How comfortable are you with subjects like math and science?',
    'Are you looking for a flexible program where you can specialize later?',
    'Would you prefer a structured degree (e.g., Computer Science, Biotechnology) or one with multiple pathways (e.g., Business, Media)?',
    'Would you like a program with strong industry connections and internship opportunities?',
    'Do you see yourself working in an office, lab, outdoors, or remotely?',
    'Do you want a job with a predictable routine or a dynamic work environment?',
    'How important is global career mobility for you?',
    'Do you want a career that allows remote work?'
]

# Label encode categorical columns
label_encoders = {}
for col in label_encode_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Define features (X) and target (y)
X = data.drop(columns=["Id", "Start time", "Completion time", "Email", "Name", "Which field interests you the most?"])
y = data["Which field interests you the most?"]

# Apply SMOTE for data augmentation
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine augmented data into a new DataFrame
augmented_data = pd.DataFrame(X_resampled, columns=X.columns)
augmented_data["Which field interests you the most?"] = y_resampled

# Save the augmented data
augmented_data.to_csv('augmented_data.csv', index=False)

print("Data augmentation complete. Augmented data saved to 'augmented_data.csv'.")

['Id', 'Start time', 'Completion time', 'Email', 'Name', 'Which high school curriculum did you follow?\n', 'If you studied the UAE Curriculum, which track did you follow?\n', 'Which subjects did you study in high school? (Select all that apply)', 'What was your IELTS/TOEFL score?', 'Which field interests you the most?', 'What type of career do you envision?', 'Do you prefer a technical, creative, or business-oriented role?\n', 'Would you like to work in a research, corporate, startup, or freelance environment?', 'How important is job stability in your career choice?', 'Do you prefer working with people or working with technology?', 'Do you enjoy working on structured tasks with clear guidelines?\n', 'Do you enjoy hands-on work (e.g., lab experiments, building things)?\n', 'Do you prefer theoretical learning, practical work, or a mix of both?\n', 'Would you rather work on individual projects or team-based assignments?', 'Do you prefer exams, projects, or research-based assessments?', 'H

KeyError: 'Which high school curriculum did you follow?'

In [15]:
data = pd.get_dummies(data, columns=label_encode_columns, drop_first=True)


In [16]:
# Define features (X) and target (y)
X = data.drop(columns=["Id", "Start time", "Completion time", "Email", "Name", "Which field interests you the most?"])
y = data["Which field interests you the most?"]

In [17]:
from sklearn.model_selection import train_test_split

# Split the data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder

# ... existing code ...

# Encode categorical features using OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X)

# Apply SMOTE to generate synthetic samples
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_encoded, y)

# Convert back to DataFrame (optional)
X_resampled = pd.DataFrame(X_resampled.toarray(), columns=encoder.get_feature_names_out())

In [19]:
# Save preprocessed data
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ... existing code ...

# First, let's print the actual column names to see what we're working with
print("Column names:", data.columns.tolist())

# Create label encoders for categorical columns
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
categorical_columns = [col for col in categorical_columns if col not in ["Id", "Start time", "Completion time", "Email", "Name"]]

# Label encode all categorical columns
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Define features (X) and target (y)
X = data.drop(columns=["Id", "Start time", "Completion time", "Email", "Name", "Which field interests you the most?"])
y = data["Which field interests you the most?"]

# Split the data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Now train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

Column names: ['Id', 'Start time', 'Completion time', 'Email', 'Name', 'Which high school curriculum did you follow?\n', 'If you studied the UAE Curriculum, which track did you follow?\n', 'Which subjects did you study in high school? (Select all that apply)', 'What was your IELTS/TOEFL score?', 'Which field interests you the most?', 'What type of career do you envision?', 'Do you prefer a technical, creative, or business-oriented role?\n', 'Would you like to work in a research, corporate, startup, or freelance environment?', 'How important is job stability in your career choice?', 'Do you prefer working with people or working with technology?', 'Do you enjoy working on structured tasks with clear guidelines?\n', 'Do you enjoy hands-on work (e.g., lab experiments, building things)?\n', 'Do you prefer theoretical learning, practical work, or a mix of both?\n', 'Would you rather work on individual projects or team-based assignments?', 'Do you prefer exams, projects, or research-based ass

In [21]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [22]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 64.00%


In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# 1. Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Try a more complex Random Forest with hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    verbose=2
)

# Fit the model
grid_search.fit(X_train_scaled, y_train)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

# Evaluate on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.2f}%".format(accuracy * 100))

# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=

In [31]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.2f}%".format(accuracy * 100))


Test set accuracy: 52.00%


In [32]:
# After training your model
import pickle

# Save the model
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Save the scaler
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save the label encoders
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pickle
import random
from imblearn.over_sampling import SMOTE

# Function to load and preprocess the data
def load_and_preprocess_data(file_path):
    # Try different encodings until one works
    encodings_to_try = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
    
    for encoding in encodings_to_try:
        try:
            # Try to load with the current encoding
            print(f"Trying encoding: {encoding}")
            df = pd.read_csv(file_path, sep='\t', encoding=encoding)
            print(f"Successfully loaded with encoding: {encoding}")
            break
        except UnicodeDecodeError:
            print(f"Failed with encoding: {encoding}")
            continue
    else:
        # If no encoding worked
        raise ValueError("Could not decode the file with any of the tried encodings")
    
    # Drop unnecessary columns
    columns_to_drop = ['Id', 'Start time', 'Completion time', 'Email', 'Name']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
    
    # Convert fields with checkbox responses to dummy variables
    # First, explode the subjects column since it contains multiple values
    subject_col = 'Which subjects did you study in high school? (Select all that apply)'
    if subject_col in df.columns:
        subjects = df[subject_col].str.split(';')
        subject_dummies = pd.get_dummies(subjects.apply(pd.Series).stack()).groupby(level=0).sum()
        subject_dummies.columns = ['Subject_' + col for col in subject_dummies.columns]
        
        # Drop the original subjects column
        df = df.drop(columns=[subject_col])
        
        # Join the dummy variables back to the original dataframe
        df = pd.concat([df, subject_dummies], axis=1)
    
    # Set the target variable (field of interest)
    target_col = 'Which field interests you the most?'
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in the dataframe")
    
    target = df[target_col]
    
    # Drop the target column from features
    features = df.drop(columns=[target_col])
    
    # Convert categorical variables to numerical using label encoding for ordinal data
    label_encoders = {}
    for column in features.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        features[column] = le.fit_transform(features[column])
        label_encoders[column] = le
    
    return features, target, label_encoders

# Function to directly process the data string provided in the document
def process_data_from_string(data_string):
    # Write the data to a temporary file
    import tempfile
    with tempfile.NamedTemporaryFile(suffix='.txt', delete=False, mode='w', encoding='utf-8') as f:
        f.write(data_string)
        temp_file = f.name
    
    print(f"Created temporary file: {temp_file}")
    
    # Load and process the data
    return load_and_preprocess_data(temp_file)

# Function to augment the data
def augment_data(features, target, n_samples=100):
    """
    Augment the data using SMOTE and random perturbation.
    """
    # Convert features to numeric to ensure SMOTE works
    for col in features.columns:
        features[col] = pd.to_numeric(features[col], errors='coerce')
    
    # Fill NaN values with mean or mode
    for col in features.columns:
        if features[col].isna().any():
            if features[col].dtype == 'object':
                features[col] = features[col].fillna(features[col].mode()[0])
            else:
                features[col] = features[col].fillna(features[col].mean())
    
    # Use SMOTE to generate synthetic samples
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(features, target)
    
    # Add some additional random perturbation to create more variation
    augmented_features = []
    augmented_targets = []
    
    for i in range(len(X_resampled)):
        # Keep original sample
        augmented_features.append(X_resampled.iloc[i].values)
        augmented_targets.append(y_resampled.iloc[i])
        
        # Create perturbed versions if needed to reach n_samples
        if len(augmented_features) < n_samples:
            # Create a perturbed version with small random changes
            perturbed = X_resampled.iloc[i].copy()
            
            # Randomly modify some numeric features (about 30% of features)
            num_to_modify = max(1, int(0.3 * len(perturbed)))
            indices_to_modify = random.sample(range(len(perturbed)), num_to_modify)
            
            for idx in indices_to_modify:
                # Small perturbation, keeping the value within a reasonable range
                if isinstance(perturbed[idx], (int, float)):
                    perturbed[idx] += random.uniform(-0.5, 0.5)
                    # Ensure the perturbed value is not negative for binary features
                    if perturbed[idx] < 0:
                        perturbed[idx] = 0
                    elif perturbed[idx] > 1 and X_resampled.iloc[i, idx] <= 1:
                        perturbed[idx] = 1
            
            augmented_features.append(perturbed.values)
            augmented_targets.append(y_resampled.iloc[i])
    
    # Convert back to DataFrame/Series
    augmented_features_df = pd.DataFrame(augmented_features[:n_samples], columns=features.columns)
    augmented_targets_series = pd.Series(augmented_targets[:n_samples])
    
    return augmented_features_df, augmented_targets_series

# Function to train the model
def train_model(features, target):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42
    )
    
    # Initialize and train the model
    rf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42
    )
    
    rf_model.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    return rf_model, accuracy, report, X_test, y_test

# Function to save the model
def save_model(model, label_encoders, file_path="major_prediction_model.pkl"):
    model_data = {
        'model': model,
        'label_encoders': label_encoders
    }
    with open(file_path, 'wb') as f:
        pickle.dump(model_data, f)
    print(f"Model saved to {file_path}")

# Function to load the model
def load_model(file_path="major_prediction_model.pkl"):
    with open(file_path, 'rb') as f:
        model_data = pickle.load(f)
    return model_data['model'], model_data['label_encoders']

# Function to make predictions
def predict_major(features, model, label_encoders):
    # Preprocess the input features similar to training data
    for column in features.columns:
        if column in label_encoders:
            features[column] = label_encoders[column].transform(features[column])
    
    # Make prediction
    prediction = model.predict(features)
    
    return prediction



# Main execution
if __name__ == "__main__":
    # Specify the path to your CSV file
    file_path = "Exploring Your Future_ Major Selection Survey(Sheet1).csv"
    
    # Load and preprocess the data
    print("Loading and preprocessing data...")
    features, target, label_encoders = load_and_preprocess_data(file_path)
    
    # Original data stats
    print(f"Original data size: {len(features)}")
    print(f"Original class distribution: {target.value_counts()}")
    
    # Augment the data
    print("\nAugmenting data...")
    augmented_features, augmented_target = augment_data(features, target, n_samples=200)
    
    print(f"Augmented data size: {len(augmented_features)}")
    print(f"Augmented class distribution: {augmented_target.value_counts()}")
    
    # Train the model
    print("\nTraining model...")
    model, accuracy, report, X_test, y_test = train_model(augmented_features, augmented_target)
    
    # Print model evaluation results
    print(f"\nModel accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)
    
    # Print feature importance
    feature_importance = pd.Series(model.feature_importances_, index=features.columns)
    print("\nTop 10 most important features:")
    print(feature_importance.sort_values(ascending=False).head(10))
    
    # Save the model
    save_model(model, label_encoders)
    
    # Demonstration of using the saved model
    print("\nDemonstration of model prediction:")
    # Use the test set for demonstration
    sample_input = X_test.iloc[0:1]
    actual_major = y_test.iloc[0]
    
    # Load the saved model
    loaded_model, loaded_encoders = load_model()
    
    # Make prediction
    predicted_major = predict_major(sample_input, loaded_model, loaded_encoders)
    
    print(f"Sample input features: {sample_input.values}")
    print(f"Actual major: {actual_major}")
    print(f"Predicted major: {predicted_major[0]}")

Loading and preprocessing data...
Trying encoding: utf-8


ParserError: Error tokenizing data. C error: EOF inside string starting at row 8

In [30]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier as DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import joblib
import chardet
import os
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn import tree

def detect_encoding(file_path):
    """
    Detect the encoding of a file
    """
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
    return result['encoding']

def load_data(file_path):
    """
    Load the data with the correct encoding
    """
    try:
        # Detect the file encoding
        encoding = detect_encoding(file_path)
        print(f"Detected file encoding: {encoding}")
        
        # Read the data file with the detected encoding
        df = pd.read_csv(file_path, encoding=encoding)
        return df
    except Exception as e:
        print(f"Error loading file: {e}")
        return None

def prepare_data(df):
    """
    Prepare data for training a decision tree model
    """
    # Check if best_fit column exists
    if 'best_fit' not in df.columns:
        print("Error: 'best_fit' column not found in dataset")
        return None, None, None, None, None
    
    # Drop rows with missing values in the target column
    df = df.dropna(subset=['best_fit'])
    
    # Print distribution of majors
    print("\nDistribution of majors in dataset:")
    print(df['best_fit'].value_counts())
    
    # Get feature columns - exclude the best_fit column
    # Also exclude any metadata columns like 'id', 'name', etc.
    # Customize this list according to your dataset
    excluded_cols = ['best_fit', 'id', 'name', 'student_id', 'email']
    feature_cols = [col for col in df.columns if col not in excluded_cols]
    
    print(f"\nUsing {len(feature_cols)} features for prediction:")
    print(", ".join(feature_cols))
    
    # Split data into features and target
    X = df[feature_cols]
    y = df['best_fit']
    
    # Check for missing values
    missing_values = X.isnull().sum().sum()
    print(f"\nTotal missing values in features: {missing_values}")
    
    # Identify numeric and categorical columns
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    print(f"\nNumeric columns: {len(numeric_cols)}")
    print(f"Categorical columns: {len(categorical_cols)}")
    
    return X, y, feature_cols, numeric_cols, categorical_cols

def create_preprocessing_pipeline(numeric_cols, categorical_cols):
    """
    Create a preprocessing pipeline that handles missing values
    """
    # Preprocessing for numerical data: impute missing values with median
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Preprocessing for categorical data: impute missing values with most frequent value
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])
    
    return preprocessor

def augment_data(X, y, preprocessor):
    """
    Preprocess data and augment using SMOTE
    """
    print("\nPreprocessing and augmenting dataset...")
    
    # Fit and transform the data - this will handle missing values
    X_processed = preprocessor.fit_transform(X).toarray()  # Convert to dense
    
    # Apply SMOTE to balance the classes
    smote = SMOTE(random_state=42, k_neighbors=1)  # Adjust k_neighbors to fit available data
    X_resampled, y_resampled = smote.fit_resample(X_processed, y)
    
    print(f"Original dataset size: {len(X)}")
    print(f"Augmented dataset size: {len(X_resampled)}")
    
    return X_resampled, y_resampled

def train_decision_tree(X_train, y_train):
    """
    Train a decision tree classifier
    """
    print("\nTraining decision tree model...")
    
    # Create and train the decision tree
    dt_classifier = DecisionTreeClassifier(
        max_depth=5,  # Prevent overfitting
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42
    )
    
    dt_classifier.fit(X_train, y_train)
    
    return dt_classifier

def create_complete_pipeline(preprocessor, model):
    """
    Create a full pipeline that includes preprocessing and the model
    """
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    return full_pipeline

def visualize_tree(model, feature_names, class_names):
    """
    Visualize the decision tree
    """
    plt.figure(figsize=(20, 10))
    tree.plot_tree(model, 
                   feature_names=feature_names, 
                   class_names=class_names,
                   filled=True, 
                   rounded=True, 
                   fontsize=10)
    plt.savefig("decision_tree_visualization.png")
    print("Decision tree visualization saved to 'decision_tree_visualization.png'")

def main(file_path):
    """
    Main function to run the entire process
    """
    # Load the data
    df = load_data(file_path)
    if df is None:
        return
    
    # Prepare the data
    X, y, feature_cols, numeric_cols, categorical_cols = prepare_data(df)
    if X is None:
        return
    
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y if len(y.unique()) > 1 else None
    )
    
    # Create preprocessing pipeline with imputation for missing values
    preprocessor = create_preprocessing_pipeline(numeric_cols, categorical_cols)
    
    # Preprocess and augment the training data
    X_train_augmented, y_train_augmented = augment_data(X_train, y_train, preprocessor)
    
    # Train the model
    model = train_decision_tree(X_train_augmented, y_train_augmented)
    
    # Create a complete pipeline
    pipeline = create_complete_pipeline(preprocessor, model)
    
    # Evaluate on the test set
    test_score = pipeline.score(X_test, y_test)
    print(f"\nAccuracy on test set: {test_score:.4f}")
    
    # Save the model
    joblib.dump(pipeline, "major_prediction_model.joblib")
    print("Model saved as 'major_prediction_model.joblib'")
    
    # Try to get feature names for visualization
    try:
        # Get feature names from one-hot encoder
        ohe_feature_names = []
        for name, trans, cols in preprocessor.transformers_:
            if name == 'cat':
                # Get the one-hot encoder from the pipeline
                ohe = trans.named_steps['onehot']
                ohe_feature_names.extend(ohe.get_feature_names_out(cols))
            elif name == 'num':
                ohe_feature_names.extend(cols)
        
        # Visualize the tree
        visualize_tree(model, ohe_feature_names, list(y.unique()))
    except Exception as e:
        print(f"Could not visualize tree with feature names: {e}")
        # Try to visualize without feature names
        try:
            plt.figure(figsize=(20, 10))
            tree.plot_tree(model, filled=True, rounded=True)
            plt.savefig("decision_tree_visualization.png")
            print("Decision tree visualization saved (without feature names)")
        except Exception as e2:
            print(f"Could not visualize tree: {e2}")
    
    # Create a simple function to make predictions with the saved model
    sample_code = """
    # Example code to use the saved model
    import joblib
    import pandas as pd
    
    # Load the model
    model = joblib.load("major_prediction_model.joblib")
    
    # Prepare input data (must have the same columns as training data)
    # Replace with your actual column names
    sample_data = pd.DataFrame({
        'math_score': [85],
        'science_score': [92],
        'english_score': [78],
        # Add all required columns here
    })
    
    # Make prediction
    predicted_major = model.predict(sample_data)[0]
    print(f"Recommended major: {predicted_major}")
    """
    
    # Save the sample code
    with open("use_model_example.py", "w") as f:
        f.write(sample_code)
    print("Example code for using the model saved as 'use_model_example.py'")

if __name__ == "__main__":
    file_path = "Exploring Your Future_ Major Selection Survey(Sheet1).csv"  # Replace with your file path
    main(file_path)

Detected file encoding: ISO-8859-1

Distribution of majors in dataset:
BACHELOR OF SCIENCE IN BUSINESS ADMINISTRATION                 37
BACHELOR OF SCIENCE IN COMPUTER SCIENCE                        30
BACHELOR OF SCIENCE IN BIOTECHNOLOGY                           27
BACHELOR OF ARTS IN MASS COMMUNICATION                         10
BACHELOR OF SCIENCE IN COMPUTER ENGINEERING                    10
BACHELOR OF SCIENCE IN ARTIFICIAL INTELLIGENCE                  6
BACHELOR OF ARTS IN INTERIOR DESIGN                             3
BACHELOR OF SCIENCE IN CIVIL AND INFRASTRUCTURE ENGINEERING     2
Name: best_fit, dtype: int64

Using 28 features for prediction:
Id, Start time, Completion time, Email, Name, Which high school curriculum did you follow?
, If you studied the UAE Curriculum, which track did you follow?
, Which subjects did you study in high school? (Select all that apply), What was your IELTS/TOEFL score?, Which field interests you the most?, What type of career do you envision?, 




Accuracy on test set: 0.6400
Model saved as 'major_prediction_model.joblib'
Could not visualize tree with feature names: The 'decision_tree' parameter of plot_tree must be an instance of 'sklearn.tree._classes.DecisionTreeClassifier' or an instance of 'sklearn.tree._classes.DecisionTreeRegressor'. Got RandomForestClassifier(max_depth=5, min_samples_leaf=2, min_samples_split=5,
                       random_state=42) instead.
Could not visualize tree: The 'decision_tree' parameter of plot_tree must be an instance of 'sklearn.tree._classes.DecisionTreeClassifier' or an instance of 'sklearn.tree._classes.DecisionTreeRegressor'. Got RandomForestClassifier(max_depth=5, min_samples_leaf=2, min_samples_split=5,
                       random_state=42) instead.
Example code for using the model saved as 'use_model_example.py'


<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

In [36]:
import pandas as pd
import joblib
import os

def load_original_data(file_path):
    """Load the original training data to get exact column names"""
    try:
        # Try to load with detected encoding
        df = pd.read_csv(file_path, sep='\t',encoding='ISO-8859-1',on_bad_lines='skip')  # Using tab separator since your data appears tab-delimited
        print(f"Successfully loaded data with {len(df)} rows and {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"Error loading file: {e}")
        return None

def predict_major_from_input():
    """Collect user input and predict major"""
    try:
        # Load the model
        model = joblib.load("major_prediction_model.joblib")
        print("Model loaded successfully")
        
        # Load the original data to get column names
        file_path = "Exploring Your Future_ Major Selection Survey(Sheet1).csv"
        original_data = load_original_data(file_path)
        
        if original_data is None:
            print("Could not load original data. Cannot continue.")
            return
        
        # Get all columns except the target column 'best_fit'
        input_columns = [col for col in original_data.columns if col != 'best_fit']
        
        # Create a dictionary to hold user input
        user_input = {}
        
        # Example values - these would typically come from user input
        user_answers = {
            'Id': '101',
            'Start time': '2/15/25 10:00',
            'Completion time': '2/15/25 10:05',
            'Email': 'new_student@example.com',
            'Name': 'New Student',
            'Which high school curriculum did you follow?\n': 'UAE Curriculum',
            'If you studied the UAE Curriculum, which track did you follow?\n': 'Scientific',
            'Which subjects did you study in high school? (Select all that apply)': 'Physics;Chemistry;Math;Computer Science;',
            'What was your IELTS/TOEFL score?': 'Above 6.0 (IELTS) / Above 550 (TOEFL-PBT) / Above 79 (TOEFL-IBT)',
            'Which field interests you the most?': 'Technology',
            'What type of career do you envision?': 'Corporate professional',
            'Do you prefer a technical, creative, or business-oriented role?\n': 'Technical',
            'Would you like to work in a research, corporate, startup, or freelance environment?': 'Corporate',
            'How important is job stability in your career choice?': 'Important',
            'Do you prefer working with people or working with technology?': 'Mostly with technology',
            'Do you enjoy working on structured tasks with clear guidelines?\n': 'Yes',
            'Do you enjoy hands-on work (e.g., lab experiments, building things)?\n': 'Yes',
            'Do you prefer theoretical learning, practical work, or a mix of both?\n': 'A mix of both',
            'Would you rather work on individual projects or team-based assignments?': 'A mix of both',
            'Do you prefer exams, projects, or research-based assessments?': 'Projects',
            'How comfortable are you with subjects like math and science?': 'Very comfortable',
            'Are you looking for a flexible program where you can specialize later?\n': 'No',
            'Would you prefer a structured degree (e.g., Computer Science, Biotech- nology) or one with multiple pathways (e.g., Business, Media)?': 'Structured',
            'Would you like a program with strong industry connections and internship opportunities?': 'Yes',
            'Do you see yourself working in an office, lab, outdoors, or remotely?\n': 'Office',
            'Do you want a job with a predictable routine or a dynamic work environ- ment?': 'Somewhat predictable',
            'How important is global career mobility for you?': 'Important',
            'Do you want a career that allows remote work?': 'Yes'
        }
        
        # Fill the dictionary using the EXACT column names from original data
        for col in input_columns:
            if col in user_answers:
                user_input[col] = [user_answers[col]]
            else:
                # For any missing columns, use empty string or appropriate default
                user_input[col] = ['']
                print(f"Warning: No input provided for column: '{col}'")
        
        # Create DataFrame with a single row
        new_student = pd.DataFrame(user_input)
        
        # Make sure column order matches what the model expects
        new_student = new_student[input_columns]
        
        # Make prediction
        predicted_major = model.predict(new_student)[0]
        print(f"\nRecommended major: {predicted_major}")
        
        # Get probability scores for each potential major
        probabilities = model.predict_proba(new_student)
        class_names = model.classes_
        
        # Display probabilities for each potential major
        print("\nConfidence scores for each potential major:")
        for i, major in enumerate(class_names):
            print(f"{major}: {probabilities[0][i]:.2%}")
        
        # Find the top 3 recommended majors
        top_indices = probabilities[0].argsort()[-3:][::-1]
        print("\nTop 3 recommended majors:")
        for i, idx in enumerate(top_indices):
            print(f"{i+1}. {class_names[idx]}: {probabilities[0][idx]:.2%}")
            
        return predicted_major, dict(zip(class_names, probabilities[0]))
            
    except Exception as e:
        print(f"Error during prediction: {e}")
        import traceback
        traceback.print_exc()
        return None, None

def create_interactive_app():
    """Create an interactive version for collecting real user input"""
    # This function would be implemented for a fully interactive app
    # where users are prompted for each question
    
    print("\nExample code for interactive application:")
    print("""
    def collect_user_input(questions, options=None):
        user_input = {}
        
        for question in questions:
            # Display the question and options if available
            print(f"\n{question}")
            if options and question in options:
                for i, opt in enumerate(options[question]):
                    print(f"{i+1}. {opt}")
                    
            # Get user input
            answer = input("Your answer: ")
            user_input[question] = answer
            
        return user_input
    
    # Define questions and options
    questions = [
        "Which high school curriculum did you follow?",
        "If you studied the UAE Curriculum, which track did you follow?",
        # etc.
    ]
    
    options = {
        "Which high school curriculum did you follow?": [
            "UAE Curriculum", 
            "International Baccalaureate (IB)", 
            "British Curriculum", 
            "American Curriculum"
        ],
        # Define options for other questions
    }
    
    # Collect responses
    user_answers = collect_user_input(questions, options)
    
    # Format input for model
    # [Code to properly format for model prediction]
    
    # Make prediction
    predicted_major, probabilities = predict_major_from_input(user_answers)
    """)

if __name__ == "__main__":
    print("Major Prediction Application")
    print("============================")
    predict_major_from_input()
    
    # Uncomment to print code for interactive version
    # create_interactive_app()

Major Prediction Application


Model loaded successfully
Error loading file: Error tokenizing data. C error: EOF inside string starting at row 8
Could not load original data. Cannot continue.
