In [181]:
import numpy as np
import pandas as pd

#### Load Data from the CSV File

In [182]:
# Read the raw survey responses; the path is resolved relative to the
# notebook's working directory.
survey_csv = "major_observation.csv"
survey_data = pd.read_csv(survey_csv)

#### Convert Answers from YES / NO to 1 / 0

In [None]:
# Map the textual survey answers onto binary indicators. The match is exact,
# so only the lowercase strings 'yes' and 'no' are converted. The frame is
# modified in place.
answer_codes = {'yes': 1, 'no': 0}
survey_data.replace(answer_codes, inplace=True)

# Preview the first rows to confirm the conversion.
survey_data.head()

##### Replace null values in the numeric columns (e.g. age) with the column mean


In [184]:
# Per-column means of the numeric columns serve as the imputation values;
# missing entries are filled in place, column by column.
column_means = survey_data.mean(numeric_only=True)
survey_data.fillna(column_means, inplace=True)

#### Exploratory Data Analysis (EDA)

In [185]:
# Import visualization libraries
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Display summary statistics
# print(survey_data.describe())

# # Generate a heatmap of correlations
# plt.figure(figsize=(12, 10))
# sns.heatmap(survey_data.corr(), annot=True, fmt=".2f")
# plt.show()

In [186]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Display summary statistics
# print(survey_data.describe())

# # Generate a heatmap of correlations
# plt.figure(figsize=(12, 10))
# sns.heatmap(survey_data.corr(), annot=True, fmt=".2f")
# plt.show()

#### Identify the most important features that influence the choice of major.

In [None]:
# Using feature importance from a simple tree-based model
from sklearn.ensemble import ExtraTreesClassifier
# pyplot is imported here because the earlier visualization cells that would
# have provided `plt` are commented out.
import matplotlib.pyplot as plt

# Separate the predictors from the target once, so the importance labels below
# come from exactly the columns the model was fitted on (independent of where
# the 'Major' column sits in the frame).
feature_frame = survey_data.drop('Major', axis=1)

# A fixed seed keeps the importance ranking stable between runs.
model = ExtraTreesClassifier(random_state=42)
model.fit(feature_frame, survey_data['Major'])

# Plot the ten largest feature importances
feat_importances = pd.Series(model.feature_importances_, index=feature_frame.columns)
ax = feat_importances.nlargest(10).plot(kind='barh')
ax.set_xlabel('Importance')
ax.set_title('Top 10 features influencing the choice of major')
plt.show()

#### Select and train a machine learning model for classification.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Turn the 'Major' labels into integer class ids. The fitted encoder is kept
# as `le` so that predicted ids can be decoded back into labels.
# NOTE: the column is overwritten with the ids, so running this cell a second
# time would fit the encoder on the ids instead of the original labels.
le = LabelEncoder()
major_ids = le.fit_transform(survey_data['Major'])
survey_data['Major'] = major_ids

# The target is the encoded column; the features are every other column.
y = survey_data['Major']
X = survey_data.drop(columns='Major')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic regression classifier, allowing up to 1000 solver iterations.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)


#### Evaluate the accuracy 

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Class predictions for the held-out rows.
y_pred = lr_model.predict(X_test)

# Report the overall accuracy as a percentage, then the per-class metrics.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {test_accuracy*100:.2f}%')
report_text = classification_report(y_test, y_pred)
print(report_text)


#### Recommendation System

In [None]:
import warnings

# Ignore every UserWarning raised from this point on.
# NOTE(review): presumably meant to hide scikit-learn's feature-name warning
# when predicting from a plain array - confirm, since this also hides any
# other UserWarning.
warnings.filterwarnings('ignore', category=UserWarning)

#  function to predict a major
def predict_major(input_features):
    """Recommend a major for a single survey response.

    Args:
        input_features: The encoded answers of one respondent, given either
            as a flat sequence or as a single-row nested sequence. The values
            are passed to the model positionally.

    Returns:
        The first element of ``le.inverse_transform`` applied to the class
        predicted by ``lr_model``.

    Raises:
        ValueError: If the number of answers differs from the number of
            features ``lr_model`` reports having been fitted with.
    """
    # Flatten whatever shape was passed into exactly one sample row.
    sample = np.array(input_features).reshape(1, -1)

    # Fail early with a readable message instead of an error from deep
    # inside the estimator.
    expected = getattr(lr_model, 'n_features_in_', None)
    if expected is not None and sample.shape[1] != expected:
        raise ValueError(
            f'Expected {expected} survey answers, got {sample.shape[1]}'
        )

    # Re-attach the training column names when the model recorded them, so
    # the prediction input matches the frame the model was fitted on.
    feature_names = getattr(lr_model, 'feature_names_in_', None)
    if feature_names is not None:
        sample = pd.DataFrame(sample, columns=feature_names)

    predicted_class = lr_model.predict(sample)
    return le.inverse_transform(predicted_class)[0]

# Example respondent: 21 encoded answers (presumably one per model feature -
# confirm against the training columns).
new_survey = [
    1, 0, 1, 0, 1, 0, 0,
    1, 1, 0, 0, 0, 0, 0,
    0, 1, 0, 1, 1, 0, 0,
]
# The answers are handed over wrapped as a single-row batch.
recommended_major = predict_major([new_survey])

print(f'Recommended Major: {recommended_major}')