### Predicting school dropout

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Load the dataset
data = pd.read_csv('students_data.csv', low_memory=False)

In [3]:
data

Unnamed: 0,_id,FullName,Regno,Gender,disable,ProfilePic,Class,__v,createdAt,updatedAt,...,BirthOrder,Dob,InterpBehaviour,LightingEnergy,Residence,Hazard,GreenSpaces,Transport,GuardianName,GuardianJob
0,65f17c113d8b442eca574c53,IRAKOZE NSABIYUMVA BENOIT,4.10803E+11,MALE,,,65f081f5c67b2e6822d9469a,1,2024-03-13 10:12:33.811,2024-05-30 19:43:18.662,...,,,,,,,,,,
1,65f17c113d8b442eca574c54,MUYIZERE CHANCE,1.21202E+11,FEMALE,No,,65f081f5c67b2e6822d9469a,1,2024-03-13 10:12:33.811,2024-05-30 19:43:18.769,...,,,,,,,,,,
2,65f17c113d8b442eca574c55,NIYOGUSHIMWA Soleil,4.10803E+11,FEMALE,No,,65f081f5c67b2e6822d9469a,1,2024-03-13 10:12:33.811,2024-05-30 19:43:18.871,...,,,,,,,,,,
3,65f17c113d8b442eca574c57,ABIMANIKUNDA CARINE,4.20613E+11,FEMALE,Yes,,65f081f5c67b2e6822d9469a,1,2024-03-13 10:12:33.811,2024-05-30 19:43:18.973,...,,,,,,,,,,
4,65f17c113d8b442eca574c58,IRANZI Pacifique,4.10803E+11,MALE,,,65f081f5c67b2e6822d9469a,1,2024-03-13 10:12:33.812,2024-05-30 19:43:18.977,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87392,66fff85ad3591274d9188c34,UWUMUHISHA Denyse,202410000779,FEMALE,,,66bc608bf1578cc030b6fbf8,0,2024-10-04 14:14:50.872,2024-10-04 14:14:50.872,...,Thirdborn,2018-10-18 00:00:00,Sharp,Solar,House,none,No Access to Green Spaces,Walking,,
87393,66fff930d3591274d9188c45,UWIMANIDUHAYE Seraphine,202410000780,FEMALE,,,66bc608bf1578cc030b6fbf8,0,2024-10-04 14:18:24.578,2024-10-04 14:18:24.578,...,Secondborn,2016-04-07 00:00:00,Sharp,Solar,House,,No Access to Green Spaces,Walking,,
87394,66fffa6994f358e27eba27f5,UWINEZA Clemantine,202410000781,FEMALE,,,66bc608bf1578cc030b6fbf8,0,2024-10-04 14:23:37.711,2024-10-04 14:23:37.711,...,Thirdborn,2016-02-17 00:00:00,Sharp,Solar,House,,No Access to Green Spaces,Walking,,
87395,66fffb2bd3591274d9188c68,MUTUYIMANA Delphine,202410000782,FEMALE,,,66bc608bf1578cc030b6fbf8,0,2024-10-04 14:26:51.596,2024-10-04 14:26:51.596,...,Fourthborn,2014-01-09 00:00:00,Sharp,Solar,House,,No Access to Green Spaces,Walking,,


In [4]:
data.columns

Index(['_id', 'FullName', 'Regno', 'Gender', 'disable', 'ProfilePic', 'Class',
       '__v', 'createdAt', 'updatedAt', 'Cell', 'District', 'FatherName',
       'Isibo', 'MotherName', 'ParentTel', 'Province', 'Sector', 'Village',
       'AllergiesType', 'Disabled', 'Status', 'parentAccountCreated',
       'Disability', 'ChildStatus', 'DistanceToSchool', 'ParentDetail',
       'ParentStatus', 'TravelTime', 'DroupoutReason', 'EducationalResources',
       'MedicalConditions', 'Hobbies', 'MotherJob', 'FatherJob',
       'FinancialStatus', 'BirthOrder', 'Dob', 'InterpBehaviour',
       'LightingEnergy', 'Residence', 'Hazard', 'GreenSpaces', 'Transport',
       'GuardianName', 'GuardianJob'],
      dtype='object')

In [5]:
# Select relevant features based on our previous analysis
features = [
    'Gender', 'Disability', 'ChildStatus', 'DistanceToSchool', 'BirthOrder', 
    'MedicalConditions', 'ParentStatus', 'MotherJob', 'FatherJob', 
    'FinancialStatus', 'Residence', 'Transport', 'LightingEnergy', 'Class'
]

In [6]:
data['DroupoutReason'].unique()

array([nan, 'kubura ibikoresho by"ishuri', "Amafaranga Y'ishuri",
       'Minerval', 'Gutwita', 'Yagiye mugisirikare', 'Uburwayi ',
       'Present', "I DONT 'KNOW", 'gutwita', 'double ', 'double listing',
       'No reason ', 'minerval', 'inkweto', 'ikaramu', 'Gutwita ', 'Hh ',
       'Amakaramu'], dtype=object)

In [7]:
data.isnull().sum()

_id                         0
FullName                    3
Regno                       0
Gender                   2970
disable                 86288
ProfilePic              87397
Class                       0
__v                         0
createdAt                   0
updatedAt                   0
Cell                    23469
District                21590
FatherName              34140
Isibo                   60282
MotherName              32323
ParentTel               53967
Province                28086
Sector                  22315
Village                 24830
AllergiesType           86757
Disabled                    0
Status                      0
parentAccountCreated        0
Disability              19133
ChildStatus             27844
DistanceToSchool         5651
ParentDetail            43371
ParentStatus            26203
TravelTime               1181
DroupoutReason          87377
EducationalResources    14540
MedicalConditions       73319
Hobbies                 56210
MotherJob 

In [8]:
final_data = data[[
    'Gender', 'ChildStatus', 'DistanceToSchool', 'BirthOrder', 
    'FinancialStatus', 'Residence', 'Transport', 'LightingEnergy','DroupoutReason'
]]

In [9]:
final_data

Unnamed: 0,Gender,ChildStatus,DistanceToSchool,BirthOrder,FinancialStatus,Residence,Transport,LightingEnergy,DroupoutReason
0,MALE,,,,,,,,
1,FEMALE,,,,,,,,
2,FEMALE,,,,,,,,
3,FEMALE,,,,,,,,
4,MALE,,,,,,,,
...,...,...,...,...,...,...,...,...,...
87392,FEMALE,Both parents,3Km Km,Thirdborn,Medium,House,Walking,Solar,
87393,FEMALE,Both parents,1km Km,Secondborn,Medium,House,Walking,Solar,
87394,FEMALE,Both parents,2km Km,Thirdborn,Medium,House,Walking,Solar,
87395,FEMALE,Both parents,1km Km,Fourthborn,Medium,House,Walking,Solar,


In [10]:
final_data.columns

Index(['Gender', 'ChildStatus', 'DistanceToSchool', 'BirthOrder',
       'FinancialStatus', 'Residence', 'Transport', 'LightingEnergy',
       'DroupoutReason'],
      dtype='object')

In [11]:
final_data['Gender'].unique()

array(['MALE', 'FEMALE', 'Male', nan], dtype=object)

In [12]:
final_data['Gender'].isna().sum()

2970

In [13]:
# Replace 'Male' with 'MALE' in the 'Gender' column
final_data['Gender'] = final_data['Gender'].replace('Male', 'MALE')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['Gender'] = final_data['Gender'].replace('Male', 'MALE')


In [14]:
final_data['Gender'].unique()

array(['MALE', 'FEMALE', nan], dtype=object)

In [15]:
# Count occurrences of 'MALE'
male_count = final_data['Gender'].value_counts().get('FEMALE', 0)
print(f"Number of MALE entries: {male_count}")

Number of MALE entries: 43381


In [16]:
# Drop rows with NaN values in the 'Gender' column
final_data = final_data.dropna(subset=['Gender'])


In [17]:
final_data['Gender'].unique()

array(['MALE', 'FEMALE'], dtype=object)

In [18]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84427 entries, 0 to 87396
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Gender            84427 non-null  object
 1   ChildStatus       59230 non-null  object
 2   DistanceToSchool  79008 non-null  object
 3   BirthOrder        28682 non-null  object
 4   FinancialStatus   70099 non-null  object
 5   Residence         32446 non-null  object
 6   Transport         31524 non-null  object
 7   LightingEnergy    28401 non-null  object
 8   DroupoutReason    20 non-null     object
dtypes: object(9)
memory usage: 6.4+ MB


In [19]:
#final_data['Disability'].unique()

In [20]:
final_data['ChildStatus'].unique()

array([nan, 'Orphan', 'Both parents', 'One parent'], dtype=object)

In [21]:
# Count occurrences of 'MALE'
male_count = final_data['ChildStatus'].value_counts().get('One parent', 0)
print(f"Number of MALE entries: {male_count}")

Number of MALE entries: 4283


In [22]:
# Replace NaN values in the 'ChildStatus' column with 'Orphan'
final_data['ChildStatus'] = final_data['ChildStatus'].fillna('Orphan')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['ChildStatus'] = final_data['ChildStatus'].fillna('Orphan')


In [23]:
final_data['ChildStatus'].unique()

array(['Orphan', 'Both parents', 'One parent'], dtype=object)

In [24]:
final_data['DistanceToSchool'].unique()

array([nan, '2', '5Km', ..., '119 Km', '148 Km', '174 Km'], dtype=object)

In [25]:
# Remove 'Km' and convert to integers, handling NaN
final_data['DistanceToSchool'] = final_data['DistanceToSchool'].str.replace('Km', '', regex=False)
final_data['DistanceToSchool'] = pd.to_numeric(final_data['DistanceToSchool'], errors='coerce')  # Convert to numeric
final_data['DistanceToSchool'] = final_data['DistanceToSchool'].fillna(0).astype(int)  # Replace NaN with 0 and convert to int


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['DistanceToSchool'] = final_data['DistanceToSchool'].str.replace('Km', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['DistanceToSchool'] = pd.to_numeric(final_data['DistanceToSchool'], errors='coerce')  # Convert to numeric
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

In [26]:
final_data['DistanceToSchool'].unique()

array([          0,           2,           5,           8,          12,
                40,          45,         322,          33,         200,
               125,         238,          67,           4,           1,
                10,           3,        5000,           6,         400,
               118,        3000,        6000,          50,         300,
                30,          60,           7,          14,           9,
                82,          70,         800,          20,         101,
               107,         157,          58,         103,          80,
                26,          52,         128,          53,         104,
               135,          74,         121,         145,          42,
              2900,         950,         500,          38,          56,
                23,          89,          16,          19,          15,
               150,         246,          13,         345,         987,
               234,         100,          93,         250,      

In [27]:
# Count the occurrences of the number 2 in the 'DistanceToSchool' column
count_of_2 = final_data['DistanceToSchool'].value_counts().get(2, 0)
print(f"Number of entries with value 2: {count_of_2}")


Number of entries with value 2: 3855


In [28]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84427 entries, 0 to 87396
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Gender            84427 non-null  object
 1   ChildStatus       84427 non-null  object
 2   DistanceToSchool  84427 non-null  int32 
 3   BirthOrder        28682 non-null  object
 4   FinancialStatus   70099 non-null  object
 5   Residence         32446 non-null  object
 6   Transport         31524 non-null  object
 7   LightingEnergy    28401 non-null  object
 8   DroupoutReason    20 non-null     object
dtypes: int32(1), object(8)
memory usage: 6.1+ MB


In [29]:
# Count the number of entries greater than 400 in the 'DistanceToSchool' column
count_greater_400 = (final_data['DistanceToSchool'] > 1000).sum()
print(f"Number of entries greater than 400: {count_greater_400}")


Number of entries greater than 400: 117


In [30]:
# Count the occurrences of the number 2 in the 'DistanceToSchool' column
count_of_2 = final_data['DistanceToSchool'].value_counts().get(5, 0)
print(f"Number of entries with value 2: {count_of_2}")


Number of entries with value 2: 259


In [31]:
final_data['DistanceToSchool'].unique()

array([          0,           2,           5,           8,          12,
                40,          45,         322,          33,         200,
               125,         238,          67,           4,           1,
                10,           3,        5000,           6,         400,
               118,        3000,        6000,          50,         300,
                30,          60,           7,          14,           9,
                82,          70,         800,          20,         101,
               107,         157,          58,         103,          80,
                26,          52,         128,          53,         104,
               135,          74,         121,         145,          42,
              2900,         950,         500,          38,          56,
                23,          89,          16,          19,          15,
               150,         246,          13,         345,         987,
               234,         100,          93,         250,      

In [32]:
# Replace values greater than 500 with 5 in the 'DistanceToSchool' column
final_data['DistanceToSchool'] = final_data['DistanceToSchool'].apply(lambda x: 5 if x > 500 else x)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['DistanceToSchool'] = final_data['DistanceToSchool'].apply(lambda x: 5 if x > 500 else x)


In [33]:
# Count the number of entries greater than 400 in the 'DistanceToSchool' column
count_greater_400 = (final_data['DistanceToSchool'] > 200).sum()
print(f"Number of entries greater than 400: {count_greater_400}")


Number of entries greater than 400: 87


In [34]:
final_data['BirthOrder'].unique()

array([nan, 'Fourthborn', 'Thirdborn', 'Secondborn', 'Firstborn',
       'Fifthborn', 'Sixthborn', 'Seventhborn', 'Eighthborn', 'Others',
       'Ninthborn', 'Twelfthborn', 'Eleventhborn', 'Tenthborn'],
      dtype=object)

In [35]:
# Count the occurrences of the number 2 in the 'DistanceToSchool' column
count_of_2 = final_data['BirthOrder'].value_counts().get(5, 0)
print(f"Number of entries with value 2: {count_of_2}")


Number of entries with value 2: 634


  count_of_2 = final_data['BirthOrder'].value_counts().get(5, 0)


In [36]:
# Replace NaN values in the 'BirthOrder' column with 'Unknown'
final_data['BirthOrder'] = final_data['BirthOrder'].fillna('Secondborn')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['BirthOrder'] = final_data['BirthOrder'].fillna('Secondborn')


In [37]:
final_data['BirthOrder'].unique()

array(['Secondborn', 'Fourthborn', 'Thirdborn', 'Firstborn', 'Fifthborn',
       'Sixthborn', 'Seventhborn', 'Eighthborn', 'Others', 'Ninthborn',
       'Twelfthborn', 'Eleventhborn', 'Tenthborn'], dtype=object)

In [38]:
#final_data['MedicalConditions'].unique()

In [39]:
#final_data['ParentStatus'].unique()

In [40]:
#final_data['MotherJob'].unique()

In [41]:
# Count NaN values in the 'BirthOrder' column
#nan_count = final_data['MotherJob'].isna().sum()
#print(f"Number of NaN values in 'BirthOrder' column: {nan_count}")


In [42]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84427 entries, 0 to 87396
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Gender            84427 non-null  object
 1   ChildStatus       84427 non-null  object
 2   DistanceToSchool  84427 non-null  int64 
 3   BirthOrder        84427 non-null  object
 4   FinancialStatus   70099 non-null  object
 5   Residence         32446 non-null  object
 6   Transport         31524 non-null  object
 7   LightingEnergy    28401 non-null  object
 8   DroupoutReason    20 non-null     object
dtypes: int64(1), object(8)
memory usage: 6.4+ MB


In [43]:
#final_data['FatherJob'].unique()

In [44]:
# Count NaN values in the 'BirthOrder' column
#nan_count = final_data['FatherJob'].isna().sum()
#print(f"Number of NaN values in 'BirthOrder' column: {nan_count}")


In [45]:
final_data['FinancialStatus'].unique()

array([nan, 'false', 'Rich', 'Poverty', 'Medium'], dtype=object)

In [46]:
# Replace NaN with 'Poverty' and 'false' with 'Medium' in the 'FinancialStatus' column
final_data['FinancialStatus'] = final_data['FinancialStatus'].fillna('Poverty')
final_data['FinancialStatus'] = final_data['FinancialStatus'].replace('false', 'Medium')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['FinancialStatus'] = final_data['FinancialStatus'].fillna('Poverty')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['FinancialStatus'] = final_data['FinancialStatus'].replace('false', 'Medium')


In [47]:
final_data['FinancialStatus'].unique()

array(['Poverty', 'Medium', 'Rich'], dtype=object)

In [48]:
final_data['Residence'].unique()

array([nan, 'Apartment', 'House', 'Mobile Home', 'Condo', 'Mansion',
       'Ranch', 'Townhouse', 'Tiny House', 'Farmhouse', 'Duplex', 'Villa',
       'Cabin', 'Bungalow'], dtype=object)

In [49]:
# Replace NaN with 'Poverty' and 'false' with 'Medium' in the 'FinancialStatus' column
final_data['Residence'] = final_data['Residence'].fillna('Tiny House')
#final_data['Residence'] = final_data['Residence'].replace('false', 'Medium')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['Residence'] = final_data['Residence'].fillna('Tiny House')


In [50]:
final_data['Residence'].unique()

array(['Tiny House', 'Apartment', 'House', 'Mobile Home', 'Condo',
       'Mansion', 'Ranch', 'Townhouse', 'Farmhouse', 'Duplex', 'Villa',
       'Cabin', 'Bungalow'], dtype=object)

In [51]:
final_data['Transport'].unique()

array([nan, 'Car', 'Walking', 'No Transportation', 'Bicycle',
       'Public Transit', 'Scooter', 'Motorcycle', 'Electric Vehicle',
       'Carpooling', 'Telecommuting', 'No Regular Transportation'],
      dtype=object)

In [52]:
# Replace NaN with 'Poverty' and 'false' with 'Medium' in the 'FinancialStatus' column
final_data['Transport'] = final_data['Transport'].fillna('Walking')
#final_data['Residence'] = final_data['Residence'].replace('false', 'Medium')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['Transport'] = final_data['Transport'].fillna('Walking')


In [53]:
final_data['Transport'].unique()

array(['Walking', 'Car', 'No Transportation', 'Bicycle', 'Public Transit',
       'Scooter', 'Motorcycle', 'Electric Vehicle', 'Carpooling',
       'Telecommuting', 'No Regular Transportation'], dtype=object)

In [54]:
final_data['LightingEnergy'].unique()

array([nan, 'Solar', 'Electricity', 'Geothermal', 'Nuclear',
       'Hydroelectric', 'Wind', 'Wind Energy', 'Solar Energy',
       'Nuclear Power', 'Geothermal Energy', 'Hydroelectric Power'],
      dtype=object)

In [55]:
# Replace NaN with 'Poverty' and 'false' with 'Medium' in the 'FinancialStatus' column
final_data['LightingEnergy'] = final_data['LightingEnergy'].fillna('Electricity')
#final_data['Residence'] = final_data['Residence'].replace('false', 'Medium')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['LightingEnergy'] = final_data['LightingEnergy'].fillna('Electricity')


In [56]:
final_data['LightingEnergy'].unique()

array(['Electricity', 'Solar', 'Geothermal', 'Nuclear', 'Hydroelectric',
       'Wind', 'Wind Energy', 'Solar Energy', 'Nuclear Power',
       'Geothermal Energy', 'Hydroelectric Power'], dtype=object)

In [57]:
final_data['DroupoutReason'].unique()

array([nan, 'kubura ibikoresho by"ishuri', "Amafaranga Y'ishuri",
       'Minerval', 'Gutwita', 'Yagiye mugisirikare', 'Uburwayi ',
       'Present', "I DONT 'KNOW", 'gutwita', 'double ', 'double listing',
       'No reason ', 'minerval', 'inkweto', 'ikaramu', 'Gutwita ', 'Hh ',
       'Amakaramu'], dtype=object)

In [58]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84427 entries, 0 to 87396
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Gender            84427 non-null  object
 1   ChildStatus       84427 non-null  object
 2   DistanceToSchool  84427 non-null  int64 
 3   BirthOrder        84427 non-null  object
 4   FinancialStatus   84427 non-null  object
 5   Residence         84427 non-null  object
 6   Transport         84427 non-null  object
 7   LightingEnergy    84427 non-null  object
 8   DroupoutReason    20 non-null     object
dtypes: int64(1), object(8)
memory usage: 6.4+ MB


In [59]:
final_data.isnull().sum()

Gender                  0
ChildStatus             0
DistanceToSchool        0
BirthOrder              0
FinancialStatus         0
Residence               0
Transport               0
LightingEnergy          0
DroupoutReason      84407
dtype: int64

In [60]:
final_data.head()

Unnamed: 0,Gender,ChildStatus,DistanceToSchool,BirthOrder,FinancialStatus,Residence,Transport,LightingEnergy,DroupoutReason
0,MALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,
1,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,
2,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,
3,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,
4,MALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,


In [61]:
final_data['DroupoutReason'].unique()

array([nan, 'kubura ibikoresho by"ishuri', "Amafaranga Y'ishuri",
       'Minerval', 'Gutwita', 'Yagiye mugisirikare', 'Uburwayi ',
       'Present', "I DONT 'KNOW", 'gutwita', 'double ', 'double listing',
       'No reason ', 'minerval', 'inkweto', 'ikaramu', 'Gutwita ', 'Hh ',
       'Amakaramu'], dtype=object)

In [62]:
# Replace NaN values in the 'DroupoutReason' column with 'studying'
final_data['DroupoutReason'] = final_data['DroupoutReason'].fillna('studying')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['DroupoutReason'] = final_data['DroupoutReason'].fillna('studying')


In [63]:
#final_data['MedicalConditions'].unique()

In [64]:
final_data.isnull().sum()

Gender              0
ChildStatus         0
DistanceToSchool    0
BirthOrder          0
FinancialStatus     0
Residence           0
Transport           0
LightingEnergy      0
DroupoutReason      0
dtype: int64

In [65]:
final_data.head()

Unnamed: 0,Gender,ChildStatus,DistanceToSchool,BirthOrder,FinancialStatus,Residence,Transport,LightingEnergy,DroupoutReason
0,MALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying
1,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying
2,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying
3,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying
4,MALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying


In [66]:
final_data['DroupoutReason'].unique()

array(['studying', 'kubura ibikoresho by"ishuri', "Amafaranga Y'ishuri",
       'Minerval', 'Gutwita', 'Yagiye mugisirikare', 'Uburwayi ',
       'Present', "I DONT 'KNOW", 'gutwita', 'double ', 'double listing',
       'No reason ', 'minerval', 'inkweto', 'ikaramu', 'Gutwita ', 'Hh ',
       'Amakaramu'], dtype=object)

In [67]:
final_data

Unnamed: 0,Gender,ChildStatus,DistanceToSchool,BirthOrder,FinancialStatus,Residence,Transport,LightingEnergy,DroupoutReason
0,MALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying
1,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying
2,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying
3,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying
4,MALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying
...,...,...,...,...,...,...,...,...,...
87392,FEMALE,Both parents,3,Thirdborn,Medium,House,Walking,Solar,studying
87393,FEMALE,Both parents,0,Secondborn,Medium,House,Walking,Solar,studying
87394,FEMALE,Both parents,0,Thirdborn,Medium,House,Walking,Solar,studying
87395,FEMALE,Both parents,0,Fourthborn,Medium,House,Walking,Solar,studying


In [68]:
# # Count the number of ones in the 'Target' column
# ones_count = final_data['Target'].sum()
# print(f"Number of ones in 'Target': {ones_count}")


In [69]:
# # Count the number of zeros in the 'Target' column
# zeros_count = (final_data['Target'] == 0).sum()
# print(f"Number of zeros in 'Target': {zeros_count}")


### moodel

In [70]:
features=[
    'Gender', 'ChildStatus', 'DistanceToSchool', 'BirthOrder', 
    'FinancialStatus', 'Residence', 'Transport', 'LightingEnergy'
]

In [71]:
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [72]:
# Define dropout reasons
dropout_reasons = [
    'kubura ibikoresho by"ishuri', "Amafaranga Y'ishuri", 'Minerval',
    'Gutwita', 'Yagiye mugisirikare', 'Uburwayi ', 'double ', 'double listing',
    'No reason ', 'minerval', 'inkweto', 'ikaramu', 'Gutwita ', 'Amakaramu'
]

In [73]:
# Map the 'DroupoutReason' column to binary target
final_data['Target'] = final_data['DroupoutReason'].apply(lambda x: 1 if x in dropout_reasons else 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['Target'] = final_data['DroupoutReason'].apply(lambda x: 1 if x in dropout_reasons else 0)


In [74]:
final_data

Unnamed: 0,Gender,ChildStatus,DistanceToSchool,BirthOrder,FinancialStatus,Residence,Transport,LightingEnergy,DroupoutReason,Target
0,MALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying,0
1,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying,0
2,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying,0
3,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying,0
4,MALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity,studying,0
...,...,...,...,...,...,...,...,...,...,...
87392,FEMALE,Both parents,3,Thirdborn,Medium,House,Walking,Solar,studying,0
87393,FEMALE,Both parents,0,Secondborn,Medium,House,Walking,Solar,studying,0
87394,FEMALE,Both parents,0,Thirdborn,Medium,House,Walking,Solar,studying,0
87395,FEMALE,Both parents,0,Fourthborn,Medium,House,Walking,Solar,studying,0


In [75]:
final_data['Target'].unique()

array([0, 1], dtype=int64)

In [76]:
# Select only the relevant features and target
X = final_data[features]
y = final_data['Target']

In [77]:
# Count the number of zeros in the 'Target' column
zeros_count = (final_data['Target'] == 0).sum()
print(f"Number of zeros in 'Target': {zeros_count}")


Number of zeros in 'Target': 84411


In [78]:
X.columns

Index(['Gender', 'ChildStatus', 'DistanceToSchool', 'BirthOrder',
       'FinancialStatus', 'Residence', 'Transport', 'LightingEnergy'],
      dtype='object')

In [79]:
y.info()

<class 'pandas.core.series.Series'>
Index: 84427 entries, 0 to 87396
Series name: Target
Non-Null Count  Dtype
--------------  -----
84427 non-null  int64
dtypes: int64(1)
memory usage: 1.3 MB


In [80]:
X['Residence'].unique()

array(['Tiny House', 'Apartment', 'House', 'Mobile Home', 'Condo',
       'Mansion', 'Ranch', 'Townhouse', 'Farmhouse', 'Duplex', 'Villa',
       'Cabin', 'Bungalow'], dtype=object)

In [81]:
# Identify categorical features (assume object and some integer columns are categorical)
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [82]:
categorical_features

['Gender',
 'ChildStatus',
 'BirthOrder',
 'FinancialStatus',
 'Residence',
 'Transport',
 'LightingEnergy']

In [83]:
X

Unnamed: 0,Gender,ChildStatus,DistanceToSchool,BirthOrder,FinancialStatus,Residence,Transport,LightingEnergy
0,MALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity
1,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity
2,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity
3,FEMALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity
4,MALE,Orphan,0,Secondborn,Poverty,Tiny House,Walking,Electricity
...,...,...,...,...,...,...,...,...
87392,FEMALE,Both parents,3,Thirdborn,Medium,House,Walking,Solar
87393,FEMALE,Both parents,0,Secondborn,Medium,House,Walking,Solar
87394,FEMALE,Both parents,0,Thirdborn,Medium,House,Walking,Solar
87395,FEMALE,Both parents,0,Fourthborn,Medium,House,Walking,Solar


In [84]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [85]:
# Create a CatBoost Pool (optional but recommended for categorical features)
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)

In [86]:

# Initialize and train the CatBoost model
model = CatBoostClassifier(iterations=500, learning_rate=0.1, depth=6, verbose=100)
# Train using the Pool object
model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50)

0:	learn: 0.3520821	test: 0.3521874	best: 0.3521874 (0)	total: 219ms	remaining: 1m 49s
100:	learn: 0.0010639	test: 0.0021483	best: 0.0021483 (100)	total: 5.28s	remaining: 20.9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.002117624801
bestIteration = 122

Shrink model to first 123 iterations.


<catboost.core.CatBoostClassifier at 0x1ddaf3fc190>

In [87]:
# Save the trained model to a file
model.save_model('catboost_model.bin')

print("Model saved successfully!")


Model saved successfully!


In [88]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Feature importance (optional)
feature_importances = model.get_feature_importance(prettified=True)
print("Feature Importances:\n", feature_importances)

Accuracy: 0.9997038967191757
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     16881
           1       0.00      0.00      0.00         5

    accuracy                           1.00     16886
   macro avg       0.50      0.50      0.50     16886
weighted avg       1.00      1.00      1.00     16886

Feature Importances:
          Feature Id  Importances
0       ChildStatus    45.274648
1   FinancialStatus    21.167631
2            Gender    10.649156
3         Residence     6.387756
4  DistanceToSchool     5.765065
5         Transport     4.348185
6        BirthOrder     3.210297
7    LightingEnergy     3.197261


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### on unseen data

In [95]:
import pandas as pd

# Example of unseen data as an array
unseen_data = [
    ['Male', 'Orphan', 5, 'Secondborn', 'Poverty', 'Apartment', 'Car', 'Electricity'],
    ['Female', 'Orphan', 10, 'Thirdborn', 'Rich', 'Townhouse', 'Walking', 'Solar'],
    ['Male', 'Both parents', 3, 'Fourthborn', 'Poverty', 'Duplex', 'Carpooling', 'Wind'],
    ['Female', 'Both parents', 7, 'Fourthborn', 'Medium', 'Villa', 'Scooter', 'Solar']
]

# Create a DataFrame using the same features as the training data
columns = ['Gender', 'ChildStatus', 'DistanceToSchool', 'BirthOrder', 
           'FinancialStatus', 'Residence', 'Transport', 'LightingEnergy']

unseen_df = pd.DataFrame(unseen_data, columns=columns)

# You can now use this `unseen_df` DataFrame to make predictions
# Assuming you already have the trained CatBoost model from previous steps
categorical_features_unseen = unseen_df.select_dtypes(include=['object']).columns.tolist()

# Create a Pool for the unseen data
unseen_pool = Pool(data=unseen_df, cat_features=categorical_features)

# Load the saved model from the file
loaded_model = CatBoostClassifier()
loaded_model.load_model('catboost_model.bin')

# Make predictions on the unseen data
y_unseen_pred = loaded_model.predict(unseen_pool)

# Print predictions
print("Predictions on Unseen Data:", y_unseen_pred)


Predictions on Unseen Data: [0 0 0 0]


In [None]:
unseen_df.info()

In [91]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84427 entries, 0 to 87396
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Gender            84427 non-null  object
 1   ChildStatus       84427 non-null  object
 2   DistanceToSchool  84427 non-null  int64 
 3   BirthOrder        84427 non-null  object
 4   FinancialStatus   84427 non-null  object
 5   Residence         84427 non-null  object
 6   Transport         84427 non-null  object
 7   LightingEnergy    84427 non-null  object
dtypes: int64(1), object(7)
memory usage: 5.8+ MB


In [98]:
import pandas as pd

# Assuming `final_data` is your DataFrame
unseen_df.to_csv('final_data.csv', index=False)

print("DataFrame saved as CSV successfully!")


DataFrame saved as CSV successfully!
