In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('career_recommender.csv')
df.head()

In [5]:
df.columns

Index(['What is your name?', 'What is your gender?',
       'What was your course in UG?',
       'What is your UG specialization? Major Subject (Eg; Mathematics)',
       'What are your interests?',
       'What are your skills ? (Select multiple if necessary)',
       'What was the average CGPA or Percentage obtained in under graduation?',
       'Did you do any certification courses additionally?',
       'If yes, please specify your certificate course title.',
       'Are you working?',
       'If yes, then what is/was your first Job title in your current field of work? If not applicable, write NA.               ',
       'Have you done masters after undergraduation? If yes, mention your field of masters.(Eg; Masters in Mathematics)'],
      dtype='object')

In [7]:
df.isnull().sum()

What is your name?                                                                                                            0
What is your gender?                                                                                                          0
What was your course in UG?                                                                                                   0
What is your UG specialization? Major Subject (Eg; Mathematics)                                                               0
What are your interests?                                                                                                      0
What are your skills ? (Select multiple if necessary)                                                                         1
What was the average CGPA or Percentage obtained in under graduation?                                                         0
Did you do any certification courses additionally?                                                      

In [9]:
df['What are your skills ? (Select multiple if necessary)'].mode()

0    NO
Name: What are your skills ? (Select multiple if necessary), dtype: object

In [11]:
# 1. Data Loading and Preprocessing
def load_and_preprocess_data(file_path):
    """
    Load and preprocess the career dataset
    """
    # Load data
    df = pd.read_csv(file_path)
    
    # Clean column names
    df.columns = [col.strip() for col in df.columns]
    
    # Fill NA/NaN values
    df = df.fillna('Not Applicable')
    
    return df

In [13]:
# 2. Feature Engineering
def engineer_features(df):
    """
    Convert categorical variables to numerical and prepare features for similarity matching
    """
    # Initialize LabelEncoder
    le = LabelEncoder()
    
    # Encode categorical columns
    categorical_cols = ['What is your gender?', 
                       'What was your course in UG?',
                       'What is your UG specialization? Major Subject (Eg; Mathematics)',
                       'Did you do any certification courses additionally?',
                       'Are you working?',
                       'Have you done masters after undergraduation? If yes, mention your field of masters.(Eg; Masters in Mathematics)']
    
    for col in categorical_cols:
        df[col + '_encoded'] = le.fit_transform(df[col])
    
    # Process skills and interests
    def split_and_encode(text):
        if isinstance(text, str):
            return text.split(';')
        return []

    df['skills_list'] = df['What are your skills ? (Select multiple if necessary)'].apply(split_and_encode)
    df['interests_list'] = df['What are your interests?'].apply(split_and_encode)
    
    # Create feature matrix
    feature_matrix = pd.DataFrame()
    feature_matrix['gender'] = df['What is your gender?_encoded']
    feature_matrix['course'] = df['What was your course in UG?_encoded']
    feature_matrix['specialization'] = df['What is your UG specialization? Major Subject (Eg; Mathematics)_encoded']
    feature_matrix['cgpa'] = pd.to_numeric(df['What was the average CGPA or Percentage obtained in under graduation?'], errors='coerce')
    feature_matrix['certification'] = df['Did you do any certification courses additionally?_encoded']
    feature_matrix['working'] = df['Are you working?_encoded']
    feature_matrix['masters'] = df['Have you done masters after undergraduation? If yes, mention your field of masters.(Eg; Masters in Mathematics)_encoded']
    
    # Normalize features
    scaler = StandardScaler()
    feature_matrix_scaled = scaler.fit_transform(feature_matrix)

    return feature_matrix_scaled, df

In [15]:
# 3. Career Recommendation Function
def recommend_career(user_inputs, feature_matrix_scaled, original_df):
    """
    Recommend careers based on user inputs and similarity matching
    """
    # Convert user inputs to feature vector
    user_vector = np.zeros((1, feature_matrix_scaled.shape[1]))
    
    # Map user inputs to feature vector (this would need to be aligned with the feature engineering process)
    # This is a simplified version - you'd need to use the same encoding scheme as in feature engineering
    user_vector[0, 0] = 1 if user_inputs['gender'].lower() == 'male' else 0  # gender
    user_vector[0, 1] = 0  # course (simplified)
    user_vector[0, 2] = 0  # specialization (simplified)
    user_vector[0, 3] = float(user_inputs['cgpa'])  # cgpa
    user_vector[0, 4] = 1 if user_inputs['certification'].lower() == 'yes' else 0  # certification
    user_vector[0, 5] = 1 if user_inputs['working'].lower() == 'yes' else 0  # working
    user_vector[0, 6] = 1 if user_inputs['masters'].lower() != 'not applicable' else 0  # masters
    
    # Calculate similarity scores
    similarities = cosine_similarity(user_vector, feature_matrix_scaled)
    
    # Get top 3 similar profiles
    top_matches_idx = similarities[0].argsort()[-3:][::-1]
    
    recommendations = []
    for idx in top_matches_idx:
        career_info = {
            'name': original_df.iloc[idx]['What is your name?'],
            'job_title': original_df.iloc[idx]['If yes, then what is/was your first Job title in your current field of work? If not applicable, write NA.'],
            'similarity_score': similarities[0][idx]
        }
        recommendations.append(career_info)
    
    return recommendations

In [17]:
# 4. User Input Collection
def get_user_inputs():
    """
    Collect user inputs through interactive prompts
    """
    questions = {
        'name': 'What is your name?',
        'gender': 'What is your gender? (Male/Female)',
        'course': 'What was your course in UG?',
        'specialization': 'What is your UG specialization?',
        'interests': 'What are your interests? (separate multiple interests with semicolons)',
        'skills': 'What are your skills? (separate multiple skills with semicolons)',
        'cgpa': 'What was your average CGPA or Percentage in under graduation?',
        'certification': 'Did you do any certification courses additionally? (Yes/No)',
        'cert_title': 'If yes, please specify your certificate course title',
        'working': 'Are you working? (Yes/No)',
        'job_title': 'If yes, what is your current job title? (write NA if not applicable)',
        'masters': 'Have you done masters after undergraduation? If yes, mention your field'
    }
    
    user_inputs = {}
    for key, question in questions.items():
        user_inputs[key] = input(f"{question}: ")
    
    return user_inputs


In [None]:
# Example usage
if __name__ == "__main__":
    # Load and preprocess data
    df = load_and_preprocess_data('career_recommender.csv')
    
    # Engineer features
    feature_matrix_scaled, processed_df = engineer_features(df)
    
    # Get user inputs
    user_inputs = get_user_inputs()
    
    # Get recommendations
    recommendations = recommend_career(user_inputs, feature_matrix_scaled, processed_df)
    
    # Display recommendations
    print("\nBased on your profile, here are the recommended career paths:")
    for i, rec in enumerate(recommendations, 1):
        print(f"\n{i}. Recommended Career Path:")
        print(f"Based on similar profile: {rec['name']}")
        print(f"Recommended Job Role: {rec['job_title']}")
        print(f"Similarity Score: {rec['similarity_score']:.2f}")

In [11]:
import pandas as pd

# Load the CSV file
csv_file = 'career_recommender.csv' 
json_file = 'career.json'

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file)

# Convert DataFrame to JSON and save to file
df.to_json(json_file, orient='records', lines=True)

print(f"CSV file '{csv_file}' has been converted to JSON file '{json_file}'")


CSV file 'career_recommender.csv' has been converted to JSON file 'career.json'


In [9]:
with open('career.json', 'r') as f:
    data = json.load(f)

print(data)

JSONDecodeError: Extra data: line 2 column 1 (char 799)