# Analyze placementdata - 10k.csv
Clean the dataset by encoding the text columns as numbers and normalizing each numeric column to the range 0 to 1.

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Path to the placement dataset, relative to this notebook.
datasetFileName = "../data/placementdata - 10k.csv"
placementDataFrame = pd.read_csv(filepath_or_buffer=datasetFileName)

# Preview the first five rows as the cell output.
placementDataFrame.head(n=5)


Unnamed: 0,StudentID,CGPA,Internships,Projects,Workshops/Certifications,AptitudeTestScore,SoftSkillsRating,ExtracurricularActivities,PlacementTraining,SSC_Marks,HSC_Marks,PlacementStatus
0,1,7.5,1,1,1,65,4.4,No,No,61,79,NotPlaced
1,2,8.9,0,3,2,90,4.0,Yes,Yes,78,82,Placed
2,3,7.3,1,2,2,82,4.8,Yes,No,79,80,NotPlaced
3,4,7.5,1,1,2,85,4.4,Yes,Yes,81,80,Placed
4,5,8.3,1,2,2,86,4.5,Yes,Yes,74,88,Placed


In [3]:
# Summarise the raw dataset: the mean of every numeric column and the share of
# each label in every non-numeric column.

# The first column is StudentID, an identifier rather than a measurement, so it
# is left out of the summary.
featureFrame = placementDataFrame.iloc[:, 1:]

numericColumns = []
nonNumericColumns = []
for c in featureFrame.columns:
    # is_numeric_dtype classifies by dtype family, so text columns stored in a
    # dedicated string dtype are treated as labels just like "object" columns.
    if pd.api.types.is_numeric_dtype(featureFrame[c]):
        numericColumns.append(c)
    else:
        nonNumericColumns.append(c)

# Column means computed by pandas instead of a Python-level loop over rows.
# Series.mean ignores missing values.
averages = {c: featureFrame[c].mean() for c in numericColumns}

# Percentage of rows carrying each label, keyed by column and then by label.
# sort=False keeps labels in order of first appearance; dropna=False keeps
# missing values as their own category so the shares are relative to all rows.
labelPercentages = {}
for c in nonNumericColumns:
    shares = featureFrame[c].value_counts(normalize=True, sort=False, dropna=False)
    labelPercentages[c] = (shares * 100).to_dict()


print("Averages:")
for c, average in averages.items():
    print(f"{c} - {average}")
print()
print("Percentages")


for c, labels in labelPercentages.items():
    print(c)
    for label, percentage in labels.items():
        print(f"{label} - {percentage}", end="  ")
    print()

Averages:
CGPA - 7.69800999999997
Internships - 1.0492
Projects - 2.0266
Workshops/Certifications - 1.0132
AptitudeTestScore - 79.4499
SoftSkillsRating - 4.3239600000000395
SSC_Marks - 69.1594
HSC_Marks - 74.5015

Percentages
ExtracurricularActivities
No - 41.46  Yes - 58.540000000000006  
PlacementTraining
No - 26.82  Yes - 73.18  
PlacementStatus
NotPlaced - 58.03  Placed - 41.97  


# Data Analysis
## Averages:
- CGPA - 7.69801
- Internships - 1.0492
- Projects - 2.0266
- Workshops/Certifications - 1.0132
- AptitudeTestScore - 79.4499
- SoftSkillsRating - 4.32396
- SSC_Marks - 69.1594
- HSC_Marks - 74.5015

## Percentages
### ExtracurricularActivities
 - No - 41.46  Yes - 58.54
### PlacementTraining
 - No - 26.82  Yes - 73.18  
### PlacementStatus
 - NotPlaced - 58.03  Placed - 41.97 

In [4]:
# Mapping text to numbers

# Integer code for every label of each text column. Labels are matched after
# trimming surrounding whitespace and lower-casing.
labelCodesByColumn = {
    "ExtracurricularActivities": {"yes": 1, "no": 0},
    "PlacementTraining": {"yes": 1, "no": 0},
    "PlacementStatus": {"notplaced": 0, "placed": 1},
}


def encodeLabels(column, labelCodes):
    """Return `column` with its text labels replaced by integer codes.

    Parameters:
        column: pandas Series holding text labels.
        labelCodes: dict mapping each normalized (stripped, lower-cased) label
            to its integer code.

    Returns:
        The encoded Series. A column that is already numeric is returned
        unchanged, so re-running the cell is harmless.

    Raises:
        ValueError: if the column contains a label with no entry in
            `labelCodes`. Missing values are passed through as NaN.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column

    normalized = column.str.strip().str.lower()
    encoded = normalized.map(labelCodes)

    # A label absent from labelCodes maps to NaN; report it instead of letting
    # it flow silently into the model input.
    unknown = normalized[encoded.isna() & normalized.notna()].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unrecognised labels in column {column.name!r}: {sorted(unknown)}")
    return encoded


for columnName, labelCodes in labelCodesByColumn.items():
    placementDataFrame[columnName] = encodeLabels(placementDataFrame[columnName], labelCodes)

# Rescale each numeric column to lie between 0 and 1.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# made later in the notebook, so test rows influence the scaling range.
# Consider fitting it on the training rows only.
scaler = MinMaxScaler()
numeric_cols = ['CGPA', 'Internships', 'Projects', 'Workshops/Certifications',
                'AptitudeTestScore', 'SoftSkillsRating', 'SSC_Marks', 'HSC_Marks']

placementDataFrame[numeric_cols] = scaler.fit_transform(placementDataFrame[numeric_cols])


In [5]:
# Train a logistic-regression classifier that predicts PlacementStatus and
# report its accuracy on a held-out test set.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Columns kept out of the feature matrix: the StudentID identifier, the target
# itself, and two features this model leaves out.
excludedColumns = [
    "StudentID",
    "PlacementStatus",
    "AptitudeTestScore",
    "Workshops/Certifications",
]
X = placementDataFrame.drop(columns=excludedColumns)
y = placementDataFrame["PlacementStatus"]

print(X.head())

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and training chain.
model = LogisticRegression().fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


       CGPA  Internships  Projects  SoftSkillsRating  \
0  0.384615          0.5  0.333333          0.777778   
1  0.923077          0.0  1.000000          0.555556   
2  0.307692          0.5  0.666667          1.000000   
3  0.384615          0.5  0.333333          0.777778   
4  0.692308          0.5  0.666667          0.833333   

   ExtracurricularActivities  PlacementTraining  SSC_Marks  HSC_Marks  
0                          0                  0   0.171429   0.709677  
1                          1                  1   0.657143   0.806452  
2                          1                  0   0.685714   0.741935  
3                          1                  1   0.742857   0.741935  
4                          1                  1   0.542857   1.000000  
Accuracy: 0.785
              precision    recall  f1-score   support

           0       0.83      0.80      0.81      1172
           1       0.73      0.76      0.75       828

    accuracy                           0.79      2000

## Different Models

### Logistic Regression Model
- 78.5% accuracy (about 79%) on the 20% held-out test set using logistic regression, with `StudentID`, `AptitudeTestScore`, and `Workshops/Certifications` excluded from the features

# Analyze student career suggestion - 20k.csv

In [3]:
import pandas as pd

# Path to the career-suggestion dataset, relative to this notebook.
suggestionDataFileName = "../data/student career suggestion - 20k.csv"
suggestionDF = pd.read_csv(filepath_or_buffer=suggestionDataFileName)

# Preview the first five rows as the cell output.
suggestionDF.head(n=5)


Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert,Suggested Job Role
0,69,63,78,87,94,94,87,84,61,9,...,Prayer books,salary,no,stubborn,Management,salary,hard worker,yes,no,Database Developer
1,78,62,73,60,71,70,73,84,91,12,...,Childrens,salary,yes,gentle,Technical,salary,hard worker,no,yes,Portal Administrator
2,71,86,91,87,61,81,72,72,94,11,...,Travel,Work,no,stubborn,Management,work,hard worker,no,yes,Portal Administrator
3,76,87,60,84,89,73,62,88,69,7,...,Romance,Work,yes,gentle,Management,work,smart worker,yes,yes,Systems Security Administrator
4,92,62,90,67,71,89,73,71,73,4,...,Cookbooks,salary,no,stubborn,Management,work,hard worker,yes,yes,Business Systems Analyst
