## Pre-processing


In [141]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [142]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [143]:
# Load the tab-separated survey responses into a DataFrame and preview the first rows.
# NOTE(review): the saved output echoes this read_csv line, which looks like a pandas
# warning raised by the call (its message is not visible here) — confirm, and if it is
# a mixed-dtype warning consider passing explicit dtypes.
train_set = pd.read_csv(
    'data//data.csv',
    sep='\t',  # `sep` is the primary spelling of the `delimiter` alias
)
print(train_set.head())

  train_set = pd.read_csv('data//data.csv', delimiter='\t')


   R1  R2  R3  R4  R5  R6  R7  R8  I1  I2  ...  orientation  race  voted  \
0   3   4   3   1   1   4   1   3   5   5  ...            1     1      2   
1   1   1   2   4   1   2   2   1   5   5  ...            3     4      1   
2   2   1   1   1   1   1   1   1   4   1  ...            1     4      2   
3   3   1   1   2   2   2   2   2   4   1  ...            1     1      2   
4   4   1   1   2   1   1   1   2   5   5  ...            3     1      2   

   married  familysize  uniqueNetworkLocation  country  source      major  \
0        1           1                      1       US       2        NaN   
1        2           3                      1       US       1    Nursing   
2        1           1                      1       US       1        NaN   
3        1           1                      1       CN       0        NaN   
4        1           4                      1       PH       0  education   

   Unnamed: 93  
0          NaN  
1          NaN  
2          NaN  
3          N

## 1. Handle duplicates

In [144]:
# Number of fully duplicated rows; every repeat after a row's first occurrence counts once.
train_set.duplicated(keep='first').sum()

0

## 2. Check for null values

In [145]:
# Collect the labels of every column that contains at least one missing value.
# (pandas is already imported in the notebook's import cell, so it is not re-imported here.)
null_columns = train_set.columns[train_set.isna().any()]

# Print the affected column names, one per line.
print("Columns with null values:")
for col in null_columns:
    print(col)


Columns with null values:
country
major
Unnamed: 93


There are no null values in the respondents' answers. Country can be dropped (not relevant), and Major will be pre-processed later.

In [146]:
# Remove the columns that carry nulls but are not needed, and discard rows with no
# `major`, since `major` is the column every later correlation is computed against.
train_set = (
    train_set
    .drop(columns=['country'])
    .dropna(subset=['major'])
    .drop(columns=['Unnamed: 93'])
)

## 3. Correlation

In [147]:
# Replace each distinct `major` string with an integer code so the column can take
# part in the numeric correlation computed in the next cell.
# NOTE(review): the raw strings are encoded as-is, so spelling variants such as
# "psychology", "Psychology" and "Psychology " (see the value counts later in the
# notebook) receive different codes, and the codes carry no natural ordering —
# confirm that correlating against them is intended.
label_encoder = LabelEncoder()
train_set = train_set.assign(major=label_encoder.fit_transform(train_set['major']))

In [148]:
# Take the `major` column of the full correlation matrix: one coefficient per
# attribute, measured against the encoded `major` values.
correlation_with_main = train_set.corr().loc[:, 'major']

# Order from the most positive to the most negative coefficient.
sorted_correlation = correlation_with_main.sort_values(ascending=False)

# Show the ranked coefficients.
print("Correlation with", 'major', "for all other attributes:")
print(sorted_correlation)

Correlation with major for all other attributes:
major     1.000000
S3        0.096814
S5        0.085407
gender    0.074262
S8        0.062371
            ...   
VCL2     -0.070979
VCL5     -0.074718
E5       -0.075433
C6       -0.076737
VCL13    -0.097827
Name: major, Length: 92, dtype: float64


# Filter for each RIASEC question group

In [149]:
# Correlations of the columns whose name starts with "R", ranked by absolute
# strength (strongest first) while keeping each coefficient's sign.
r_columns = [name for name in sorted_correlation.index if name.startswith('R')]
r_correlations = sorted_correlation.loc[r_columns]
r_strength_order = r_correlations.abs().sort_values(ascending=False).index
r_correlations_sorted = r_correlations.loc[r_strength_order]
print(r_correlations_sorted)

R4   -0.063783
R6   -0.044292
R2   -0.030956
R7   -0.030565
R8   -0.028638
R1   -0.026375
R3   -0.024054
R5   -0.016933
Name: major, dtype: float64


In [150]:
# Correlations of the columns whose name starts with "I", ranked by absolute
# strength (strongest first) while keeping each coefficient's sign.
i_columns = [name for name in sorted_correlation.index if name.startswith('I')]
i_correlations = sorted_correlation.loc[i_columns]
i_strength_order = i_correlations.abs().sort_values(ascending=False).index
i_correlations_sorted = i_correlations.loc[i_strength_order]
print(i_correlations_sorted)

I8   -0.046517
I1    0.035950
I4    0.033164
I7    0.013177
I2    0.012164
I5    0.008699
I6   -0.006355
I3   -0.005188
Name: major, dtype: float64


In [151]:
# Correlations of the columns whose name starts with "A", ranked by absolute
# strength (strongest first) while keeping each coefficient's sign.
a_columns = [name for name in sorted_correlation.index if name.startswith('A')]
a_correlations = sorted_correlation.loc[a_columns]
a_strength_order = a_correlations.abs().sort_values(ascending=False).index
a_correlations_sorted = a_correlations.loc[a_strength_order]
print(a_correlations_sorted)

A6   -0.039260
A8   -0.033771
A5   -0.032443
A4   -0.029286
A3   -0.028107
A2   -0.022327
A1   -0.020827
A7   -0.016896
Name: major, dtype: float64


In [152]:
# Correlations of the columns whose name starts with "S", ranked by absolute
# strength (strongest first) while keeping each coefficient's sign.
s_columns = [name for name in sorted_correlation.index if name.startswith('S')]
s_correlations = sorted_correlation.loc[s_columns]
s_strength_order = s_correlations.abs().sort_values(ascending=False).index
s_correlations_sorted = s_correlations.loc[s_strength_order]
print(s_correlations_sorted)

S3    0.096814
S5    0.085407
S8    0.062371
S6    0.042938
S7    0.014250
S1    0.006551
S4    0.001472
S2   -0.001219
Name: major, dtype: float64


In [153]:
# Correlations of the columns whose name starts with "E", ranked by absolute
# strength (strongest first) while keeping each coefficient's sign.
e_columns = [name for name in sorted_correlation.index if name.startswith('E')]
e_correlations = sorted_correlation.loc[e_columns]
e_strength_order = e_correlations.abs().sort_values(ascending=False).index
e_correlations_sorted = e_correlations.loc[e_strength_order]
print(e_correlations_sorted)

E5   -0.075433
E3   -0.050543
E7   -0.034889
E8   -0.033562
E1   -0.031490
E4    0.019264
E6   -0.011198
E2   -0.007715
Name: major, dtype: float64


In [154]:
# Correlations of the columns whose name starts with "C", ranked by absolute
# strength (strongest first) while keeping each coefficient's sign.
c_columns = [name for name in sorted_correlation.index if name.startswith('C')]
c_correlations = sorted_correlation.loc[c_columns]
c_strength_order = c_correlations.abs().sort_values(ascending=False).index
c_correlations_sorted = c_correlations.loc[c_strength_order]
print(c_correlations_sorted)

C6   -0.076737
C7   -0.067660
C5   -0.067407
C3   -0.066505
C8   -0.052127
C2   -0.043965
C1   -0.041057
C4   -0.034972
Name: major, dtype: float64


# Filter VCL Questions

In [155]:
# Correlations of the columns whose name starts with "VCL", ranked by absolute
# strength (strongest first) while keeping each coefficient's sign.
vcl_columns = [name for name in sorted_correlation.index if name.startswith('VCL')]
vcl_correlations = sorted_correlation.loc[vcl_columns]
vcl_strength_order = vcl_correlations.abs().sort_values(ascending=False).index
vcl_correlations_sorted = vcl_correlations.loc[vcl_strength_order]
print(vcl_correlations_sorted)

VCL13   -0.097827
VCL5    -0.074718
VCL2    -0.070979
VCL14   -0.061739
VCL15   -0.058985
VCL3    -0.046890
VCL4    -0.045311
VCL10   -0.042534
VCL11   -0.037396
VCL1    -0.035642
VCL16   -0.023088
VCL12   -0.021183
VCL8     0.013887
VCL7    -0.011596
VCL6    -0.007161
VCL9     0.006514
Name: major, dtype: float64


# Filter TIPI

In [156]:
# Correlations of the columns whose name starts with "TIPI", ranked by absolute
# strength (strongest first) while keeping each coefficient's sign.
tipi_columns = [name for name in sorted_correlation.index if name.startswith('TIPI')]
tipi_correlations = sorted_correlation.loc[tipi_columns]
tipi_strength_order = tipi_correlations.abs().sort_values(ascending=False).index
tipi_correlations_sorted = tipi_correlations.loc[tipi_strength_order]
print(tipi_correlations_sorted)

TIPI7     0.029723
TIPI5    -0.022685
TIPI2    -0.014648
TIPI10    0.012859
TIPI4     0.009722
TIPI3    -0.007150
TIPI1     0.006641
TIPI9    -0.003497
TIPI8     0.003417
TIPI6    -0.002580
Name: major, dtype: float64


# Filter questions outside RIASEC, VCL, and TIPI

In [157]:
# Name prefixes of the question families already reviewed in the cells above.
exclude_letters = ['R', 'I', 'A', 'S', 'E', 'C', 'TIPI', 'VCL']

# Keep every column whose name starts with none of those prefixes.
# str.startswith accepts a tuple, so a single call tests all prefixes; the match is
# case-sensitive, which is why lowercase names such as "education" are kept.
excluded_prefixes = tuple(exclude_letters)
filtered_columns = [
    name for name in sorted_correlation.index
    if not name.startswith(excluded_prefixes)
]

# Coefficients of the remaining columns.
filtered_correlations = sorted_correlation.loc[filtered_columns]

# Rank by absolute strength (strongest first) while keeping each coefficient's sign.
filtered_strength_order = filtered_correlations.abs().sort_values(ascending=False).index
filtered_correlations_sorted = filtered_correlations.loc[filtered_strength_order]

# Show the ranked coefficients.
print(filtered_correlations_sorted)

major                    1.000000
gender                   0.074262
education               -0.059594
religion                 0.053448
uniqueNetworkLocation    0.039981
engnat                   0.031636
voted                    0.031209
orientation              0.023703
married                 -0.012725
race                     0.011715
urban                   -0.006309
introelapse             -0.005920
familysize               0.005554
surveyelapse            -0.004353
age                      0.002779
testelapse              -0.002557
source                  -0.001790
hand                     0.001122
Name: major, dtype: float64


## Drop Questions Based on Correlation

In [158]:
# Columns removed after reviewing the correlations printed above.
# They are dropped in one call instead of 28: this avoids building 28 intermediate
# DataFrames, and if any name is missing the KeyError is raised before anything is
# removed, so `train_set` is never left half-trimmed.
# NOTE(review): the selection does not strictly follow the printed |correlation|
# ranking — VCL16 (about 0.023) is dropped while the weaker VCL12 and VCL6 are kept,
# and uniqueNetworkLocation (about 0.040) is dropped while the weaker engnat and voted
# are kept. Confirm the intended selection rule.
columns_to_drop = [
    # the two weakest items of each RIASEC letter group
    'R3', 'R5',
    'I3', 'I6',
    'A7', 'A1',
    'S2', 'S4',
    'E2', 'E6',
    'C4', 'C1',
    # VCL items
    'VCL9', 'VCL7', 'VCL8', 'VCL16',
    # timing, session and demographic columns
    'surveyelapse', 'testelapse', 'introelapse',
    'source', 'uniqueNetworkLocation',
    'hand', 'age', 'familysize', 'urban', 'race', 'married', 'orientation',
]
train_set = train_set.drop(columns=columns_to_drop)

## Dealing with Major

In [159]:
# WARNING (from the original author): do not re-run this cell once the following
# cells have been run.
# NOTE(review): presumably because reloading replaces `trainmajor_df` and discards
# whatever the later cells did to it — confirm.
# Reload the raw file so that `major` is available as its original text rather than
# the integer codes written into `train_set`.
trainmajor_df = pd.read_csv(
    'data//data.csv',
    sep='\t',  # `sep` is the primary spelling of the `delimiter` alias
)

  trainmajor_df = pd.read_csv('data//data.csv', delimiter='\t')


In [160]:
# Apply the same null handling used for `train_set`: remove the two unneeded
# null-bearing columns and discard rows that have no `major`.
trainmajor_df = (
    trainmajor_df
    .drop(columns=['country'])
    .dropna(subset=['major'])
    .drop(columns=['Unnamed: 93'])
)

In [161]:
# Remove the same columns that were dropped from `train_set`, so both frames keep
# an identical set of columns.
# A single drop call replaces 28 separate ones: no intermediate DataFrames are
# built, and a missing name raises KeyError before anything is removed.
trainmajor_df = trainmajor_df.drop(columns=[
    # RIASEC items
    'R3', 'R5',
    'I3', 'I6',
    'A7', 'A1',
    'S2', 'S4',
    'E2', 'E6',
    'C4', 'C1',
    # VCL items
    'VCL9', 'VCL7', 'VCL8', 'VCL16',
    # timing, session and demographic columns
    'surveyelapse', 'testelapse', 'introelapse',
    'source', 'uniqueNetworkLocation',
    'hand', 'age', 'familysize', 'urban', 'race', 'married', 'orientation',
])

In [166]:
# Count how many rows carry each distinct `major` value.
major_counts = trainmajor_df['major'].value_counts()

# Order the majors from most to least frequent.
sorted_major_counts = major_counts.sort_values(ascending=False)

# Keep the 220 most frequent majors.
top_major_counts = sorted_major_counts.iloc[:220]

# Print each of those majors with its row count, one per line.
for major_name, occurrences in top_major_counts.items():
    print(f"{major_name}: {occurrences}")


psychology: 6861
Psychology: 5763
English: 2342
Business: 2290
Biology: 1289
Nursing: 1275
business: 1166
Education: 1162
nursing: 839
Psychology : 821
engineering: 773
Accounting: 730
Economics: 730
civil engineering: 675
Law: 667
biology: 655
english: 649
Computer Science: 643
History: 631
education: 607
Marketing: 571
Engineering: 555
Finance: 532
Sociology: 530
Business Administration: 481
Criminal Justice: 477
Management: 443
accounting: 437
Medicine: 420
law: 400
Communications: 394
computer science: 365
Political Science: 351
no: 350
Business Management: 337
Chemistry: 331
Social Work: 319
Music: 319
Counseling: 317
Science: 309
Art: 304
Mathematics: 304
sociology: 301
Communication: 293
medicine: 292
economics: 291
management: 288
Mechanical Engineering: 287
Physics: 282
criminal justice: 281
Business : 276
psychology : 274
mechanical engineering: 273
counseling: 266
history: 266
Architecture: 261
marketing: 252
Philosophy: 245
Journalism: 242
social work: 227
none: 217
science