# Tasks – Data Cleaning, Missing Data, Outliers  (Level 1)

## Part 1 – Data Cleaning

### 1.Check dataset structure

In [1169]:
import pandas as pd

df = pd.read_csv("bi.csv", encoding="latin1")  # Load CSV file into a DataFrame

df  # Display rows

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB
0,Christina,Binger,44,Female,Norway,Private,72,Masters,158,59.0,55
1,Alex,Walekhwa,60,M,Kenya,Private,79,Diploma,150,60.0,75
2,Philip,Leo,25,Male,Uganda,Sognsvann,55,HighSchool,130,74.0,50
3,Shoni,Hlongwane,22,F,Rsa,Sognsvann,40,High School,120,,44
4,Maria,Kedibone,23,Female,South Africa,Sognsvann,65,High School,122,91.0,80
...,...,...,...,...,...,...,...,...,...,...,...
72,Clara,Bernard,43,Female,France,Private,80,Bachelors,150,75.0,43
73,Julian,Nielsen,31,Male,Denmark,Sognsvann,90,Masters,158,84.0,83
74,Sophie,Brown,33,Female,UK,Sognsvann,96,Masters,158,85.0,90
75,Leon,Bauer,35,Male,Germany,Sognsvann,90,Masters,160,87.0,74


In [1170]:
df.shape

(77, 11)

#### Apply info() to know number of non-null values 

In [1171]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fNAME          77 non-null     object 
 1   lNAME          77 non-null     object 
 2   Age            77 non-null     int64  
 3   gender         77 non-null     object 
 4   country        77 non-null     object 
 5   residence      77 non-null     object 
 6   entryEXAM      77 non-null     int64  
 7   prevEducation  77 non-null     object 
 8   studyHOURS     77 non-null     int64  
 9   Python         75 non-null     float64
 10  DB             77 non-null     int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 6.7+ KB


In [1172]:
df.head()

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB
0,Christina,Binger,44,Female,Norway,Private,72,Masters,158,59.0,55
1,Alex,Walekhwa,60,M,Kenya,Private,79,Diploma,150,60.0,75
2,Philip,Leo,25,Male,Uganda,Sognsvann,55,HighSchool,130,74.0,50
3,Shoni,Hlongwane,22,F,Rsa,Sognsvann,40,High School,120,,44
4,Maria,Kedibone,23,Female,South Africa,Sognsvann,65,High School,122,91.0,80


#### Question: Which columns should be categorical and which should be numerical?

##### Answer: Age, entryEXAM, studyHOURS, Python, DB Are Numerical Columns but gender, country, residence, prevEducation Are Categorical Columns            

### 2.Detect inconsistent categories

#### apply unique() to know the unique values in each column

In [1173]:
df['gender'].unique()

array(['Female', 'M', 'Male', 'F', 'female', 'male'], dtype=object)

In [1174]:
df['country'].unique()

array(['Norway', 'Kenya', 'Uganda', 'Rsa', 'South Africa', 'Norge',
       'norway', 'Denmark', 'Netherlands', 'Italy', 'Spain', 'UK',
       'Somali', 'Nigeria', 'Germany', 'France'], dtype=object)

In [1175]:
df['prevEducation'].unique()

array(['Masters', 'Diploma', 'HighSchool', 'High School', 'Bachelors',
       'Barrrchelors', 'diploma', 'DIPLOMA', 'Diplomaaa', 'Doctorate'],
      dtype=object)

#### Use replace to replace calues that are same in meaning

In [1176]:
df = df.replace({"gender": {"Male": "M", "Female": "F","male": "M", "female": "F"},
                 "prevEducation": {"Barrrchelors": "Bachelors", "diploma": "Diploma","Diplomaaa": "Diploma", "High School": "HighSchool","DIPLOMA": "Diploma"}})

In [1177]:
df['prevEducation'].unique()

array(['Masters', 'Diploma', 'HighSchool', 'Bachelors', 'Doctorate'],
      dtype=object)

In [1178]:
df['gender'].unique()

array(['F', 'M'], dtype=object)

In [1179]:
df["residence"] = df["residence"].replace({
    "BI Residence": "BI_Residence",
    "BI-Residence": "BI_Residence",
    "BIResidence": "BI_Residence"
})

In [1180]:
df["residence"] 

0       Private
1       Private
2     Sognsvann
3     Sognsvann
4     Sognsvann
        ...    
72      Private
73    Sognsvann
74    Sognsvann
75    Sognsvann
76    Sognsvann
Name: residence, Length: 77, dtype: object

### 3.Handle duplicates

#### Use duplicated().sum() to know number of duplicated rows

In [1181]:
df.duplicated().sum()

np.int64(0)

#### Use drop_duplicates() to delete duplicated rows

In [1182]:
df.drop_duplicates()

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB
0,Christina,Binger,44,F,Norway,Private,72,Masters,158,59.0,55
1,Alex,Walekhwa,60,M,Kenya,Private,79,Diploma,150,60.0,75
2,Philip,Leo,25,M,Uganda,Sognsvann,55,HighSchool,130,74.0,50
3,Shoni,Hlongwane,22,F,Rsa,Sognsvann,40,HighSchool,120,,44
4,Maria,Kedibone,23,F,South Africa,Sognsvann,65,HighSchool,122,91.0,80
...,...,...,...,...,...,...,...,...,...,...,...
72,Clara,Bernard,43,F,France,Private,80,Bachelors,150,75.0,43
73,Julian,Nielsen,31,M,Denmark,Sognsvann,90,Masters,158,84.0,83
74,Sophie,Brown,33,F,UK,Sognsvann,96,Masters,158,85.0,90
75,Leon,Bauer,35,M,Germany,Sognsvann,90,Masters,160,87.0,74


In [1183]:
df.duplicated().sum()

np.int64(0)

# ____________________________________________________________________________________________________________________________________________________________________

## Part 2 – Missing Data

### 1.Identify missing values

#### Use isnull().sum() to know number of null values in all columns

In [1184]:
df.isnull().sum()

fNAME            0
lNAME            0
Age              0
gender           0
country          0
residence        0
entryEXAM        0
prevEducation    0
studyHOURS       0
Python           2
DB               0
dtype: int64

#### Question: Which columns are most affected by missing values?

#### Answer: Python Column

### 3.Impute missing values

In [1185]:
df['Python'].unique()

array([59., 60., 74., nan, 91., 88., 80., 85., 83., 79., 70., 75., 87.,
       76., 84., 33., 30., 61., 82., 66., 81., 31., 90., 69., 57., 86.,
       78., 48., 45., 15., 63., 72.])

In [1186]:
df['DB'].unique()

array([ 55,  75,  50,  44,  80,  59,  91,  60,  89,  90,  58,  99,  76,
        77,  82,  78,  73,  30,  56,  65,  83,  88,  69,  79,  45,  42,
        70,  74,  33,  92,  86,  62,  47,  67, 100,  61,  46,  68,  71,
        43])

In [1187]:
df["Python"].mean()

np.float64(75.85333333333334)

In [1188]:
df["Python"].median()

np.float64(81.0)

#### Using isnull() to know where are null value in rows

In [1189]:
df[df.isnull().any(axis=1)]

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB
3,Shoni,Hlongwane,22,F,Rsa,Sognsvann,40,HighSchool,120,,44
33,Frank,Abrahamsen,23,M,Norway,BI_Residence,68,HighSchool,152,,70


#### Use .fillna to replace these null value by mean or median

In [1190]:
try_1 = df["Python"].fillna(df["Python"].mean())
try_1

0     59.000000
1     60.000000
2     74.000000
3     75.853333
4     91.000000
        ...    
72    75.000000
73    84.000000
74    85.000000
75    87.000000
76    72.000000
Name: Python, Length: 77, dtype: float64

In [1191]:
try_2 = df["Python"].fillna(df["Python"].median())
try_2

0     59.0
1     60.0
2     74.0
3     81.0
4     91.0
      ... 
72    75.0
73    84.0
74    85.0
75    87.0
76    72.0
Name: Python, Length: 77, dtype: float64

In [1192]:
try_2.mean()

np.float64(75.98701298701299)

In [1193]:
try_1.mean()

np.float64(75.85333333333334)

#### I will assume that there are high number of outliers so i will replace non value with median 

In [1194]:
df["Python"] = df["Python"].fillna(df["Python"].median())
df["Python"]

0     59.0
1     60.0
2     74.0
3     81.0
4     91.0
      ... 
72    75.0
73    84.0
74    85.0
75    87.0
76    72.0
Name: Python, Length: 77, dtype: float64

In [1195]:
df[df.isnull().any(axis=1)]

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB


# _______________________________________________________________________________________________________________________________________________________________________________-

## Part 3 – Outliers

### 1.Detect outliers

#### Use .describe() to show mean, median, min, max and quartiles

In [1196]:
df['Python'].describe()

count    77.000000
mean     75.987013
std      15.228517
min      15.000000
25%      72.000000
50%      81.000000
75%      85.000000
max      91.000000
Name: Python, dtype: float64

In [1197]:
df['studyHOURS'].describe()

count     77.000000
mean     149.714286
std       12.743272
min      114.000000
25%      144.000000
50%      156.000000
75%      158.000000
max      160.000000
Name: studyHOURS, dtype: float64

In [1198]:
df['DB'].describe()

count     77.000000
mean      69.467532
std       17.033701
min       30.000000
25%       56.000000
50%       71.000000
75%       83.000000
max      100.000000
Name: DB, dtype: float64

#### Question: Which values in studyHOURS, Python, or DB look unrealistic?

#### We will know after applying IQR  

### 2.Handle outliers


#### Based on IQR  DB doesn't have any outliers

In [1199]:
Q1 = df['DB'].quantile(0.25)
Q3 = df['DB'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_new=df[(df['DB'] >= lower_bound) & (df['DB'] <= upper_bound)]

In [1200]:
lower_bound

np.float64(15.5)

In [1201]:
upper_bound

np.float64(123.5)

In [1202]:
df_new['DB'].describe()

count     77.000000
mean      69.467532
std       17.033701
min       30.000000
25%       56.000000
50%       71.000000
75%       83.000000
max      100.000000
Name: DB, dtype: float64

In [1203]:
df_new.describe()

Unnamed: 0,Age,entryEXAM,studyHOURS,Python,DB
count,77.0,77.0,77.0,77.0,77.0
mean,35.207792,76.753247,149.714286,75.987013,69.467532
std,10.341966,16.475784,12.743272,15.228517,17.033701
min,21.0,28.0,114.0,15.0,30.0
25%,27.0,69.0,144.0,72.0,56.0
50%,33.0,80.0,156.0,81.0,71.0
75%,42.0,90.0,158.0,85.0,83.0
max,71.0,98.0,160.0,91.0,100.0


In [1204]:
df.describe()

Unnamed: 0,Age,entryEXAM,studyHOURS,Python,DB
count,77.0,77.0,77.0,77.0,77.0
mean,35.207792,76.753247,149.714286,75.987013,69.467532
std,10.341966,16.475784,12.743272,15.228517,17.033701
min,21.0,28.0,114.0,15.0,30.0
25%,27.0,69.0,144.0,72.0,56.0
50%,33.0,80.0,156.0,81.0,71.0
75%,42.0,90.0,158.0,85.0,83.0
max,71.0,98.0,160.0,91.0,100.0


### I will do 2 senarios To treat data

#### First Senario: Remove outliers from Python column 

In [1205]:
Q1 = df_new['Python'].quantile(0.25)
Q3 = df_new['Python'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_new_2=df_new[(df_new['Python'] >= lower_bound) & (df_new['Python'] <= upper_bound)]

In [1206]:
lower_bound

np.float64(52.5)

In [1207]:
upper_bound

np.float64(104.5)

In [1208]:
df_new_2['Python'].describe()

count    71.000000
mean     79.563380
std       8.674647
min      57.000000
25%      77.000000
50%      81.000000
75%      85.000000
max      91.000000
Name: Python, dtype: float64

In [1209]:
df_new_2.to_csv("cleaned_dataset_Senairo_1.csv", index=False)

#### Second Senario: Replace all outliers with Median Values

In [1210]:
Q1 = df_new['Python'].quantile(0.25)
Q3 = df_new['Python'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

median_value = df_new['Python'].median()


In [1211]:
df_new['Python']

0     59.0
1     60.0
2     74.0
3     81.0
4     91.0
      ... 
72    75.0
73    84.0
74    85.0
75    87.0
76    72.0
Name: Python, Length: 77, dtype: float64

In [1212]:
lower_bound

np.float64(52.5)

In [1213]:
upper_bound

np.float64(104.5)

In [1214]:
df_new.loc[df['Python'] < lower_bound, 'Python'] = median_value
df_new.loc[df['Python'] > upper_bound, 'Python'] = median_value

In [1215]:
df_new.describe()

Unnamed: 0,Age,entryEXAM,studyHOURS,Python,DB
count,77.0,77.0,77.0,77.0,77.0
mean,35.207792,76.753247,149.714286,79.675325,69.467532
std,10.341966,16.475784,12.743272,8.334206,17.033701
min,21.0,28.0,114.0,57.0,30.0
25%,27.0,69.0,144.0,78.0,56.0
50%,33.0,80.0,156.0,81.0,71.0
75%,42.0,90.0,158.0,85.0,83.0
max,71.0,98.0,160.0,91.0,100.0


In [1216]:
df_new.to_csv("cleaned_dataset_Senairo_2.csv", index=False)

### What inconsistencies you found and how you fixed them

####  I did not find major inconsistencies in the dataset. The only issue was with some category values such as “diplomma” and “DIPLOMA”, which I standardized to “Diploma”.

### How missing values were imputed and why

#### The missing values were imputed using both the mean and the median for comparison. In the end, I chose the median because there were outliers in one column (Python), and the median is more robust against outliers than the mean.

### How outliers were detected and treated

#### The outliers were detected using the IQR method. I tested two different scenarios to decide on the most appropriate treatment. I found that scenario 2 was the best option because it did not reduce the dataset size and it produced more logical values compared to the other scenario.

# Task Level 2 

## Part 4 – Feature Engineering

### Create a new feature: Programming Average = (Python + DB)/2.



In [1217]:
df_new["Programming Average"] = (df["Python"] + df["DB"]) / 2
df_new.head()

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB,Programming Average
0,Christina,Binger,44,F,Norway,Private,72,Masters,158,59.0,55,57.0
1,Alex,Walekhwa,60,M,Kenya,Private,79,Diploma,150,60.0,75,67.5
2,Philip,Leo,25,M,Uganda,Sognsvann,55,HighSchool,130,74.0,50,62.0
3,Shoni,Hlongwane,22,F,Rsa,Sognsvann,40,HighSchool,120,81.0,44,62.5
4,Maria,Kedibone,23,F,South Africa,Sognsvann,65,HighSchool,122,91.0,80,85.5


### Create a binary feature: isAdult = 1 if Age >= 25, else 0.






In [1218]:
df_new["isAdult"] = (df["Age"] >= 25).astype(int)
df_new.head(7)

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB,Programming Average,isAdult
0,Christina,Binger,44,F,Norway,Private,72,Masters,158,59.0,55,57.0,1
1,Alex,Walekhwa,60,M,Kenya,Private,79,Diploma,150,60.0,75,67.5,1
2,Philip,Leo,25,M,Uganda,Sognsvann,55,HighSchool,130,74.0,50,62.0,1
3,Shoni,Hlongwane,22,F,Rsa,Sognsvann,40,HighSchool,120,81.0,44,62.5,0
4,Maria,Kedibone,23,F,South Africa,Sognsvann,65,HighSchool,122,91.0,80,85.5,0
5,Hannah,Hansen,25,F,Norge,BI_Residence,66,HighSchool,130,88.0,59,73.5,1
6,Ole,Johansen,27,M,Norway,BI_Residence,90,Bachelors,156,80.0,91,85.5,1


### Transform studyHOURS into categories (Low / Medium / High).

In [1219]:
df_new['studyHOURS'].describe()

count     77.000000
mean     149.714286
std       12.743272
min      114.000000
25%      144.000000
50%      156.000000
75%      158.000000
max      160.000000
Name: studyHOURS, dtype: float64

In [1220]:
df_new["studyHOURS"] = pd.cut(
    df["studyHOURS"],
    bins=[110, 130, 145, 170],     
    labels=["Low", "Medium", "High"]
)

df_new.head(10)

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB,Programming Average,isAdult
0,Christina,Binger,44,F,Norway,Private,72,Masters,High,59.0,55,57.0,1
1,Alex,Walekhwa,60,M,Kenya,Private,79,Diploma,High,60.0,75,67.5,1
2,Philip,Leo,25,M,Uganda,Sognsvann,55,HighSchool,Low,74.0,50,62.0,1
3,Shoni,Hlongwane,22,F,Rsa,Sognsvann,40,HighSchool,Low,81.0,44,62.5,0
4,Maria,Kedibone,23,F,South Africa,Sognsvann,65,HighSchool,Low,91.0,80,85.5,0
5,Hannah,Hansen,25,F,Norge,BI_Residence,66,HighSchool,Low,88.0,59,73.5,1
6,Ole,Johansen,27,M,Norway,BI_Residence,90,Bachelors,High,80.0,91,85.5,1
7,Lars,Olsen,29,M,norway,BI_Residence,89,Bachelors,High,85.0,60,72.5,1
8,Bjørn,Larsen,31,M,Norway,BI_Residence,88,Bachelors,High,80.0,89,84.5,1
9,Sofie,Jensen,33,F,Denmark,BI_Residence,85,Bachelors,High,83.0,90,86.5,1


### Which engineered feature do you think would add the most predictive power to the model?

#### I think The First one is the best beacuse it is continous and uses two features which are DB and Python 

## Part 5 – Feature Scaling

### Detect Numeric Columns

In [1221]:
df_new[["Age", "entryEXAM", "Python", "DB","Programming Average"]].head(5)

Unnamed: 0,Age,entryEXAM,Python,DB,Programming Average
0,44,72,59.0,55,57.0
1,60,79,60.0,75,67.5
2,25,55,74.0,50,62.0
3,22,40,81.0,44,62.5
4,23,65,91.0,80,85.5


In [1222]:
df_new[df_new["Programming Average"]<66]

Unnamed: 0,fNAME,lNAME,Age,gender,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB,Programming Average,isAdult
0,Christina,Binger,44,F,Norway,Private,72,Masters,High,59.0,55,57.0,1
2,Philip,Leo,25,M,Uganda,Sognsvann,55,HighSchool,Low,74.0,50,62.0,1
3,Shoni,Hlongwane,22,F,Rsa,Sognsvann,40,HighSchool,Low,81.0,44,62.5,0
20,Prof,Birkeland,22,M,Norway,BI_Residence,45,HighSchool,Low,81.0,30,31.5,0
21,Hanna,Isaksen,24,F,Norway,BI_Residence,50,HighSchool,Low,81.0,50,40.0,0
22,Kristine,Berg,26,F,Norway,BI_Residence,55,HighSchool,Medium,61.0,56,58.5,1
27,Aisha,Nakaweesi,37,F,Uganda,Private,55,Diploma,Medium,66.0,58,62.0,1
31,Don,Pettersen,46,M,Norway,Private,76,Diploma,High,85.0,45,65.0,1
32,Perry,Rønning,71,F,Norway,Private,30,Doctorate,Low,81.0,42,36.5,1
38,Simen,Myhre,33,F,Norway,BI_Residence,50,Diploma,Medium,69.0,33,51.0,1


### Option 1: StandardScaler (mean=0, std=1) → good for SVM, Logistic Regression.

In [1223]:
from sklearn.preprocessing import StandardScaler

numeric_cols = ["Age", "entryEXAM", "Python", "DB","Programming Average"]

scaler = StandardScaler()
df_new[numeric_cols] = scaler.fit_transform(df_new[numeric_cols])
print(df_new.head())


       fNAME      lNAME       Age gender       country  residence  entryEXAM  \
0  Christina     Binger  0.855723      F        Norway    Private  -0.290391   
1       Alex   Walekhwa  2.412963      M         Kenya    Private   0.137261   
2     Philip        Leo -0.993499      M        Uganda  Sognsvann  -1.328974   
3      Shoni  Hlongwane -1.285481      F           Rsa  Sognsvann  -2.245371   
4      Maria   Kedibone -1.188154      F  South Africa  Sognsvann  -0.718043   

  prevEducation studyHOURS    Python        DB  Programming Average  isAdult  
0       Masters       High -2.497047 -0.854917            -1.157643        1  
1       Diploma       High -2.376273  0.326925            -0.384766        1  
2    HighSchool        Low -0.685433 -1.150378            -0.789606        1  
3    HighSchool        Low  0.159987 -1.504930            -0.752803        0  
4    HighSchool        Low  1.367729  0.622386             0.940167        0  


##  Part 6 – Encoding Categorical Data

### Detect Categorical Columns

In [1224]:
df_new[["studyHOURS",'gender',"country","prevEducation",'isAdult','residence']].head()

Unnamed: 0,studyHOURS,gender,country,prevEducation,isAdult,residence
0,High,F,Norway,Masters,1,Private
1,High,M,Kenya,Diploma,1,Private
2,Low,M,Uganda,HighSchool,1,Sognsvann
3,Low,F,Rsa,HighSchool,0,Sognsvann
4,Low,F,South Africa,HighSchool,0,Sognsvann


### Handle Encoding


#### We dont need to encode isAdult column because it is 0 and 1 but other columns must be encoded

In [1225]:
df_new = pd.get_dummies(df_new, columns=["gender"], drop_first=True)
df_new

Unnamed: 0,fNAME,lNAME,Age,country,residence,entryEXAM,prevEducation,studyHOURS,Python,DB,Programming Average,isAdult,gender_M
0,Christina,Binger,0.855723,Norway,Private,-0.290391,Masters,High,-2.497047,-0.854917,-1.157643,1,False
1,Alex,Walekhwa,2.412963,Kenya,Private,0.137261,Diploma,High,-2.376273,0.326925,-0.384766,1,True
2,Philip,Leo,-0.993499,Uganda,Sognsvann,-1.328974,HighSchool,Low,-0.685433,-1.150378,-0.789606,1,True
3,Shoni,Hlongwane,-1.285481,Rsa,Sognsvann,-2.245371,HighSchool,Low,0.159987,-1.504930,-0.752803,0,False
4,Maria,Kedibone,-1.188154,South Africa,Sognsvann,-0.718043,HighSchool,Low,1.367729,0.622386,0.940167,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Clara,Bernard,0.758396,France,Private,0.198354,Bachelors,High,-0.564659,-1.564023,-1.010428,1,False
73,Julian,Nielsen,-0.409534,Denmark,Sognsvann,0.809286,Masters,High,0.522309,0.799662,0.792952,1,True
74,Sophie,Brown,-0.214879,UK,Sognsvann,1.175845,Masters,High,0.643084,1.213307,1.087381,1,False
75,Leon,Bauer,-0.020224,Germany,Sognsvann,0.809286,Masters,High,0.884632,0.267833,0.572130,1,True


In [1226]:
print(df_new["studyHOURS"].unique())


['High', 'Low', 'Medium']
Categories (3, object): ['Low' < 'Medium' < 'High']


In [1227]:
df_new["studyHOURS"] = df_new["studyHOURS"].map({ "Low": 0, "Medium": 1, "High": 2 })
df_new["studyHOURS"].head(40)

0     2
1     2
2     0
3     0
4     0
5     0
6     2
7     2
8     2
9     2
10    2
11    2
12    2
13    1
14    2
15    2
16    2
17    2
18    2
19    2
20    0
21    0
22    1
23    2
24    2
25    2
26    2
27    1
28    2
29    2
30    2
31    2
32    0
33    2
34    2
35    2
36    2
37    2
38    1
39    1
Name: studyHOURS, dtype: category
Categories (3, int64): [0 < 1 < 2]

In [1228]:
print(df_new["country"].unique())

['Norway' 'Kenya' 'Uganda' 'Rsa' 'South Africa' 'Norge' 'norway' 'Denmark'
 'Netherlands' 'Italy' 'Spain' 'UK' 'Somali' 'Nigeria' 'Germany' 'France']


In [1229]:
df_new = pd.get_dummies(df_new, columns=["country"], drop_first=True)


In [1230]:
df_new

Unnamed: 0,fNAME,lNAME,Age,residence,entryEXAM,prevEducation,studyHOURS,Python,DB,Programming Average,...,country_Nigeria,country_Norge,country_Norway,country_Rsa,country_Somali,country_South Africa,country_Spain,country_UK,country_Uganda,country_norway
0,Christina,Binger,0.855723,Private,-0.290391,Masters,2,-2.497047,-0.854917,-1.157643,...,False,False,True,False,False,False,False,False,False,False
1,Alex,Walekhwa,2.412963,Private,0.137261,Diploma,2,-2.376273,0.326925,-0.384766,...,False,False,False,False,False,False,False,False,False,False
2,Philip,Leo,-0.993499,Sognsvann,-1.328974,HighSchool,0,-0.685433,-1.150378,-0.789606,...,False,False,False,False,False,False,False,False,True,False
3,Shoni,Hlongwane,-1.285481,Sognsvann,-2.245371,HighSchool,0,0.159987,-1.504930,-0.752803,...,False,False,False,True,False,False,False,False,False,False
4,Maria,Kedibone,-1.188154,Sognsvann,-0.718043,HighSchool,0,1.367729,0.622386,0.940167,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Clara,Bernard,0.758396,Private,0.198354,Bachelors,2,-0.564659,-1.564023,-1.010428,...,False,False,False,False,False,False,False,False,False,False
73,Julian,Nielsen,-0.409534,Sognsvann,0.809286,Masters,2,0.522309,0.799662,0.792952,...,False,False,False,False,False,False,False,False,False,False
74,Sophie,Brown,-0.214879,Sognsvann,1.175845,Masters,2,0.643084,1.213307,1.087381,...,False,False,False,False,False,False,False,True,False,False
75,Leon,Bauer,-0.020224,Sognsvann,0.809286,Masters,2,0.884632,0.267833,0.572130,...,False,False,False,False,False,False,False,False,False,False


In [1231]:
print(df_new["prevEducation"].unique())


['Masters' 'Diploma' 'HighSchool' 'Bachelors' 'Doctorate']


In [1232]:
mapping = { "HighSchool": 0, "Diploma": 1, "Bachelors": 2, "Masters": 3, "Doctorate": 4}

df_new["prevEducation"] = df_new["prevEducation"].map(mapping)
df_new["prevEducation"].head(10)

0    3
1    1
2    0
3    0
4    0
5    0
6    2
7    2
8    2
9    2
Name: prevEducation, dtype: int64

In [1233]:
print(df_new["residence"].unique())


['Private' 'Sognsvann' 'BI_Residence']


In [1234]:
mapping = {"Private": 0, "Sognsvann": 1, "BI_Residence": 2}
df_new["residence"] = df_new["residence"].map(mapping)

In [1235]:
df_new["residence"].head(20)

0     0
1     0
2     1
3     1
4     1
5     2
6     2
7     2
8     2
9     2
10    2
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
Name: residence, dtype: int64

In [1236]:
df_new = df_new.replace({True: 1, False: 0})

  df_new = df_new.replace({True: 1, False: 0})


### I will remove fNAME and lNAME columns because they doesn't be necessary for the Model 

In [1237]:
df_new = df_new.drop(["fNAME", "lNAME"], axis=1)

In [1238]:
df_new

Unnamed: 0,Age,residence,entryEXAM,prevEducation,studyHOURS,Python,DB,Programming Average,isAdult,gender_M,...,country_Nigeria,country_Norge,country_Norway,country_Rsa,country_Somali,country_South Africa,country_Spain,country_UK,country_Uganda,country_norway
0,0.855723,0,-0.290391,3,2,-2.497047,-0.854917,-1.157643,1,0,...,0,0,1,0,0,0,0,0,0,0
1,2.412963,0,0.137261,1,2,-2.376273,0.326925,-0.384766,1,1,...,0,0,0,0,0,0,0,0,0,0
2,-0.993499,1,-1.328974,0,0,-0.685433,-1.150378,-0.789606,1,1,...,0,0,0,0,0,0,0,0,1,0
3,-1.285481,1,-2.245371,0,0,0.159987,-1.504930,-0.752803,0,0,...,0,0,0,1,0,0,0,0,0,0
4,-1.188154,1,-0.718043,0,0,1.367729,0.622386,0.940167,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0.758396,0,0.198354,2,2,-0.564659,-1.564023,-1.010428,1,0,...,0,0,0,0,0,0,0,0,0,0
73,-0.409534,1,0.809286,3,2,0.522309,0.799662,0.792952,1,1,...,0,0,0,0,0,0,0,0,0,0
74,-0.214879,1,1.175845,3,2,0.643084,1.213307,1.087381,1,0,...,0,0,0,0,0,0,0,1,0,0
75,-0.020224,1,0.809286,3,2,0.884632,0.267833,0.572130,1,1,...,0,0,0,0,0,0,0,0,0,0


In [1239]:
df_new.to_csv("cleaned_dataset_Senairo_2.csv", index=False)