# World Marriage Dataset

In [2]:
#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
import time

import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, precision_score

## 1. Loading The Dataset

In [4]:
df = pd.read_csv('World Marriage Dataset.csv', index_col = 'Sr.No.')
df.head()

Unnamed: 0_level_0,Country,AgeGroup,Sex,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Afghanistan,[15-19],Man,Divorced,Survey,1972,1974,National statistics
2,Afghanistan,[20-24],Man,Divorced,Survey,1972,1974,National statistics
3,Afghanistan,[25-29],Man,Divorced,Survey,1972,1974,National statistics
4,Afghanistan,[30-34],Man,Divorced,Survey,1972,1974,National statistics
5,Afghanistan,[35-39],Man,Divorced,Survey,1972,1974,National statistics


### 1.1 Exploring The Dataset

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 271604 entries, 1 to 271604
Data columns (total 8 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Country                       271604 non-null  object
 1   AgeGroup                      271604 non-null  object
 2   Sex                           271604 non-null  object
 3   MaritalStatus                 271604 non-null  object
 4   DataProcess                   271604 non-null  object
 5   Data Collection (Start Year)  271604 non-null  int64 
 6   Data Collection (End Year)    271604 non-null  int64 
 7   Data Source                   271604 non-null  object
dtypes: int64(2), object(6)
memory usage: 18.6+ MB


In [7]:
df.describe()

Unnamed: 0,Data Collection (Start Year),Data Collection (End Year)
count,271604.0,271604.0
mean,1996.96,1997.06
std,14.24,14.28
min,1954.0,1955.0
25%,1986.0,1986.0
50%,2000.0,2000.0
75%,2010.0,2010.0
max,2019.0,2019.0


In [8]:
df.shape

(271604, 8)

In [9]:
df.size

2172832

## 2. Data Preprocessing

### 2.1. Finding Null Values

In [12]:
df['Country'].isnull().sum()

0

In [13]:
df['AgeGroup'].isnull().sum()

0

In [14]:
df['Sex'].isnull().sum()

0

In [15]:
df['MaritalStatus'].isnull().sum()

0

In [16]:
df['DataProcess'].isnull().sum()

0

In [17]:
df['Data Collection (Start Year)'].isnull().sum()

0

In [18]:
df['Data Collection (End Year)'].isnull().sum()

0

In [19]:
df['Data Source'].isnull().sum()

0

This dataset has no null values.

## 3. Exploratory Data Analysis (EDA)

In [22]:
df.head()

Unnamed: 0_level_0,Country,AgeGroup,Sex,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Afghanistan,[15-19],Man,Divorced,Survey,1972,1974,National statistics
2,Afghanistan,[20-24],Man,Divorced,Survey,1972,1974,National statistics
3,Afghanistan,[25-29],Man,Divorced,Survey,1972,1974,National statistics
4,Afghanistan,[30-34],Man,Divorced,Survey,1972,1974,National statistics
5,Afghanistan,[35-39],Man,Divorced,Survey,1972,1974,National statistics


In [23]:
df['Country'].nunique()

235

In [24]:
df['AgeGroup'].nunique()

63

In [25]:
df['Sex'].unique()

array(['Man', 'Woman'], dtype=object)

In [26]:
df['MaritalStatus'].nunique()

35

In [27]:
df['MaritalStatus'].unique()

array(['Divorced', 'Married', 'Single', 'Widowed',
       'Divorced or Separated', 'Separated', 'Never married',
       'Not in union', 'Not living together',
       'Married or Living together', 'Widowed or divorced',
       'Living together', 'Consensual union', 'Ever married',
       'Currently not married', 'Consensual union, not living together',
       'Married or in consensual union',
       'Married or married but separated', 'Registred partnership',
       'Visiting partner', 'Widowed, divorced or separated',
       'Married, in consensual unions or separated',
       'Separated from consensual union',
       'Currently not married nor in consensual union',
       'Marriage contract', 'Divorced or Separated or Widowed',
       'Separated from marriage', 'Married gaunna not performed',
       'Married monogamous', 'Married polygamous', 'Divorced or Widowed',
       'Single or in consensual unions', 'Widowed or separated',
       'Married spouse absent', 'Married spouse present'

In [28]:
df['DataProcess'].unique()

array(['Survey', 'Census', 'Estimate', 'Multiround surv', 'Dual record',
       'GGS'], dtype=object)

In [29]:
df['Data Collection (Start Year)'].unique()

array([1972, 1979, 2007, 2010, 2013, 2015, 1989, 2000, 2001, 2002, 2005,
       2008, 2011, 2017, 1977, 1987, 1992, 2006, 2012, 1974, 1980, 1990,
       1970, 2014, 1991, 1959, 2016, 1981, 1971, 1976, 1986, 1988, 1993,
       1994, 1996, 1997, 1999, 1973, 1975, 1978, 1982, 2009, 2018, 1995,
       2004, 1961, 1998, 2003, 1960, 1985, 1984, 1965, 1983, 1963, 1966,
       1964, 1968, 1954, 1969, 1962, 2019, 1967], dtype=int64)

In [30]:
df['Data Collection (End Year)'].unique()

array([1974, 1979, 2008, 2010, 2011, 2014, 2016, 1989, 2000, 2001, 2002,
       2005, 2009, 2018, 1977, 1987, 1992, 2006, 2013, 1980, 1990, 1970,
       1991, 1959, 2017, 1981, 1971, 1976, 1986, 1988, 1993, 1994, 1996,
       1997, 1999, 1972, 1973, 1975, 1978, 1982, 2012, 2015, 1995, 2004,
       2007, 1961, 1998, 1960, 1985, 1984, 2003, 1965, 1983, 1964, 1966,
       1967, 1969, 1955, 1962, 2019], dtype=int64)

In [31]:
df['Data Source'].nunique()

15

In [32]:
df['Data Source'].unique()

array(['National statistics', 'UNSD', 'MICS', 'DHS_HH',
       'DHS_STATcompiler', 'US Census Bureau', 'MICS_HH', 'RHS',
       'PAPCHILD', 'PAPFAM', 'INED', 'GGS', 'Eurostat', 'GFHS', 'IPUMS'],
      dtype=object)

In [33]:
country_marital_counts = df.groupby(['Country', 'MaritalStatus']).size().unstack()
country_marital_counts

MaritalStatus,Consensual union,"Consensual union, not living together",Currently not married,Currently not married nor in consensual union,Divorced,Divorced or Separated,Divorced or Separated or Widowed,Divorced or Widowed,Ever married,Living together,...,Separated,Separated from consensual union,Separated from marriage,Single,Single or in consensual unions,Visiting partner,Widowed,Widowed or divorced,Widowed or separated,"Widowed, divorced or separated"
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,,,,,100.00,14.00,,,,,...,14.00,,,71.00,,,116.00,,,
Albania,,,,,141.00,22.00,,,,61.00,...,61.00,,,91.00,,,191.00,13.00,,
Algeria,,,,,129.00,31.00,,,,,...,57.00,,,129.00,,,160.00,,,
American Samoa,18.00,,,,84.00,,,,4.00,,...,58.00,,,90.00,,,86.00,,,
Angola,28.00,,,,101.00,20.00,,,,14.00,...,87.00,,,48.00,,,121.00,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wallis and Futuna Islands,,,,,77.00,67.00,,,,,...,,,,156.00,,,143.00,,,
Western Sahara,,,,,28.00,,,,,,...,28.00,,,28.00,,,28.00,,,
Yemen,,,,,204.00,,,,,,...,,,,102.00,,,205.00,,,
Zambia,20.00,,,,252.00,20.00,,,,125.00,...,101.00,,,86.00,,,272.00,,,


reverted# Count of marital status by country
marital_status_counts = df['MaritalStatus'].value_counts()

# Plot
plt.figure(figsize=(20,6))
plt.bar(marital_status_counts.index, marital_status_counts.values, color = 'blue')
plt.title('Distribution of Marital Status')
plt.xlabel('Marital Status')
plt.ylabel('Count')
plt.xticks(rotation=70)
plt.show()

reverted# Count of marital status by gender
#df.groupby(['Sex', 'MaritalStatus']) = groups data by sex and marital status
#size = count,i.e.,row present in each category
#unstack = converts the grouped result into a table (DataFrame), where the index is 'Sex' (gender) and each marital status becomes a separate column.
gender_marital_counts = df.groupby(['Sex', 'MaritalStatus']).size().unstack()

# Plot
gender_marital_counts.plot(kind='bar', stacked=True, figsize=(10,6))
plt.title('Gender-based Marital Status Comparison')
plt.xlabel('Gender')
plt.ylabel('Count')
#(0.9, 0.0) = This would place the legend at 90% across the x-axis and 0% up the y-axis of the figure.
plt.legend(loc=(0.9, 0.0))
plt.xticks(rotation=0)
plt.show()

reverted# Count of marital status by country
country_marital_counts = df['Country'].value_counts().nlargest(10)

# Plot
plt.figure(figsize=(10,6))
plt.bar(country_marital_counts.index, country_marital_counts.values,color = 'green')
plt.title('Top 10 Countries by Count')
plt.xlabel('Country')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

reverted# Count the occurrences of each Data Process
process_counts = df['DataProcess'].value_counts()

process_dict = process_counts.to_dict()
print('Count of each Data Process')
print('-------------------------------')
for process, count in process_dict.items():
    print('-------------------------------')
    print(process, ':',count)
print('-------------------------------')

reverted# Plotting the pie chart
plt.figure(figsize=(10,20))
#autopct - display %value of each part on chart
#1.1f indicates that the percentage should be shown with one decimal place.
#The double % is used to display the percent sign (%) after the percentage value. (25% = 25.0%)
#startangle = defines starting angle of pie chart. default is 0.
#colors is used to define one of the many color themes provided by matplotlib
plt.pie(process_counts, labels=process_counts.index, autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired.colors)
plt.title('Distribution of Data Process')
plt.show()

reverted# Count the occurrences of each Data Source
source_counts = df['Data Source'].value_counts()

source_dict = source_counts.to_dict()
print('Count of each Data Source')
print('-------------------------------')
for source, count in source_dict.items():
    print('-------------------------------')
    print(source, ':',count)
print('-------------------------------')

reverted# Plotting the pie chart
plt.figure(figsize=(10,10))
plt.pie(source_counts, labels=source_counts.index, autopct='%1.1f%%', startangle=140, colors=plt.cm.Accent.colors)
plt.title('Distribution of Data Sources')
plt.show()

## 4. Encoding, Scaling and Feature Engineering

### 4.1. Encoding

In [43]:
df.head()

Unnamed: 0_level_0,Country,AgeGroup,Sex,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Afghanistan,[15-19],Man,Divorced,Survey,1972,1974,National statistics
2,Afghanistan,[20-24],Man,Divorced,Survey,1972,1974,National statistics
3,Afghanistan,[25-29],Man,Divorced,Survey,1972,1974,National statistics
4,Afghanistan,[30-34],Man,Divorced,Survey,1972,1974,National statistics
5,Afghanistan,[35-39],Man,Divorced,Survey,1972,1974,National statistics


In [44]:
encoded_df = pd.get_dummies(df, columns=['Sex'])
print(encoded_df)

            Country AgeGroup MaritalStatus DataProcess  \
Sr.No.                                                   
1       Afghanistan  [15-19]      Divorced      Survey   
2       Afghanistan  [20-24]      Divorced      Survey   
3       Afghanistan  [25-29]      Divorced      Survey   
4       Afghanistan  [30-34]      Divorced      Survey   
5       Afghanistan  [35-39]      Divorced      Survey   
...             ...      ...           ...         ...   
271600     Zimbabwe  [55-59]       Widowed      Survey   
271601     Zimbabwe  [60-64]       Widowed      Survey   
271602     Zimbabwe  [65-69]       Widowed      Survey   
271603     Zimbabwe  [70-74]       Widowed      Survey   
271604     Zimbabwe    [75+]       Widowed      Survey   

        Data Collection (Start Year)  Data Collection (End Year)  \
Sr.No.                                                             
1                               1972                        1974   
2                               1972     

In [45]:
frequency_encoding = encoded_df['Country'].value_counts().to_dict()
encoded_df['Country_encoded'] = encoded_df['Country'].map(frequency_encoding)
encoded_df

Unnamed: 0_level_0,Country,AgeGroup,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source,Sex_Man,Sex_Woman,Country_encoded
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Afghanistan,[15-19],Divorced,Survey,1972,1974,National statistics,True,False,499
2,Afghanistan,[20-24],Divorced,Survey,1972,1974,National statistics,True,False,499
3,Afghanistan,[25-29],Divorced,Survey,1972,1974,National statistics,True,False,499
4,Afghanistan,[30-34],Divorced,Survey,1972,1974,National statistics,True,False,499
5,Afghanistan,[35-39],Divorced,Survey,1972,1974,National statistics,True,False,499
...,...,...,...,...,...,...,...,...,...,...
271600,Zimbabwe,[55-59],Widowed,Survey,2017,2017,National statistics,False,True,1736
271601,Zimbabwe,[60-64],Widowed,Survey,2017,2017,National statistics,False,True,1736
271602,Zimbabwe,[65-69],Widowed,Survey,2017,2017,National statistics,False,True,1736
271603,Zimbabwe,[70-74],Widowed,Survey,2017,2017,National statistics,False,True,1736


In [46]:
le = LabelEncoder()
encoded_df['Status_encoded'] = le.fit_transform(encoded_df['MaritalStatus'])
encoded_df

Unnamed: 0_level_0,Country,AgeGroup,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source,Sex_Man,Sex_Woman,Country_encoded,Status_encoded
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Afghanistan,[15-19],Divorced,Survey,1972,1974,National statistics,True,False,499,4
2,Afghanistan,[20-24],Divorced,Survey,1972,1974,National statistics,True,False,499,4
3,Afghanistan,[25-29],Divorced,Survey,1972,1974,National statistics,True,False,499,4
4,Afghanistan,[30-34],Divorced,Survey,1972,1974,National statistics,True,False,499,4
5,Afghanistan,[35-39],Divorced,Survey,1972,1974,National statistics,True,False,499,4
...,...,...,...,...,...,...,...,...,...,...,...
271600,Zimbabwe,[55-59],Widowed,Survey,2017,2017,National statistics,False,True,1736,31
271601,Zimbabwe,[60-64],Widowed,Survey,2017,2017,National statistics,False,True,1736,31
271602,Zimbabwe,[65-69],Widowed,Survey,2017,2017,National statistics,False,True,1736,31
271603,Zimbabwe,[70-74],Widowed,Survey,2017,2017,National statistics,False,True,1736,31


In [47]:
le = LabelEncoder()
encoded_df['Age_encoded'] = le.fit_transform(encoded_df['AgeGroup'])
encoded_df

Unnamed: 0_level_0,Country,AgeGroup,MaritalStatus,DataProcess,Data Collection (Start Year),Data Collection (End Year),Data Source,Sex_Man,Sex_Woman,Country_encoded,Status_encoded,Age_encoded
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Afghanistan,[15-19],Divorced,Survey,1972,1974,National statistics,True,False,499,4,9
2,Afghanistan,[20-24],Divorced,Survey,1972,1974,National statistics,True,False,499,4,17
3,Afghanistan,[25-29],Divorced,Survey,1972,1974,National statistics,True,False,499,4,22
4,Afghanistan,[30-34],Divorced,Survey,1972,1974,National statistics,True,False,499,4,26
5,Afghanistan,[35-39],Divorced,Survey,1972,1974,National statistics,True,False,499,4,29
...,...,...,...,...,...,...,...,...,...,...,...,...
271600,Zimbabwe,[55-59],Widowed,Survey,2017,2017,National statistics,False,True,1736,31,47
271601,Zimbabwe,[60-64],Widowed,Survey,2017,2017,National statistics,False,True,1736,31,50
271602,Zimbabwe,[65-69],Widowed,Survey,2017,2017,National statistics,False,True,1736,31,55
271603,Zimbabwe,[70-74],Widowed,Survey,2017,2017,National statistics,False,True,1736,31,60


In [48]:
encoded_df = pd.get_dummies(encoded_df, columns=['DataProcess'])
encoded_df

Unnamed: 0_level_0,Country,AgeGroup,MaritalStatus,Data Collection (Start Year),Data Collection (End Year),Data Source,Sex_Man,Sex_Woman,Country_encoded,Status_encoded,Age_encoded,DataProcess_Census,DataProcess_Dual record,DataProcess_Estimate,DataProcess_GGS,DataProcess_Multiround surv,DataProcess_Survey
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Afghanistan,[15-19],Divorced,1972,1974,National statistics,True,False,499,4,9,False,False,False,False,False,True
2,Afghanistan,[20-24],Divorced,1972,1974,National statistics,True,False,499,4,17,False,False,False,False,False,True
3,Afghanistan,[25-29],Divorced,1972,1974,National statistics,True,False,499,4,22,False,False,False,False,False,True
4,Afghanistan,[30-34],Divorced,1972,1974,National statistics,True,False,499,4,26,False,False,False,False,False,True
5,Afghanistan,[35-39],Divorced,1972,1974,National statistics,True,False,499,4,29,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271600,Zimbabwe,[55-59],Widowed,2017,2017,National statistics,False,True,1736,31,47,False,False,False,False,False,True
271601,Zimbabwe,[60-64],Widowed,2017,2017,National statistics,False,True,1736,31,50,False,False,False,False,False,True
271602,Zimbabwe,[65-69],Widowed,2017,2017,National statistics,False,True,1736,31,55,False,False,False,False,False,True
271603,Zimbabwe,[70-74],Widowed,2017,2017,National statistics,False,True,1736,31,60,False,False,False,False,False,True


In [49]:
le = LabelEncoder()
encoded_df['source_encoded'] = le.fit_transform(encoded_df['Data Source'])
encoded_df

Unnamed: 0_level_0,Country,AgeGroup,MaritalStatus,Data Collection (Start Year),Data Collection (End Year),Data Source,Sex_Man,Sex_Woman,Country_encoded,Status_encoded,Age_encoded,DataProcess_Census,DataProcess_Dual record,DataProcess_Estimate,DataProcess_GGS,DataProcess_Multiround surv,DataProcess_Survey,source_encoded
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,Afghanistan,[15-19],Divorced,1972,1974,National statistics,True,False,499,4,9,False,False,False,False,False,True,9
2,Afghanistan,[20-24],Divorced,1972,1974,National statistics,True,False,499,4,17,False,False,False,False,False,True,9
3,Afghanistan,[25-29],Divorced,1972,1974,National statistics,True,False,499,4,22,False,False,False,False,False,True,9
4,Afghanistan,[30-34],Divorced,1972,1974,National statistics,True,False,499,4,26,False,False,False,False,False,True,9
5,Afghanistan,[35-39],Divorced,1972,1974,National statistics,True,False,499,4,29,False,False,False,False,False,True,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271600,Zimbabwe,[55-59],Widowed,2017,2017,National statistics,False,True,1736,31,47,False,False,False,False,False,True,9
271601,Zimbabwe,[60-64],Widowed,2017,2017,National statistics,False,True,1736,31,50,False,False,False,False,False,True,9
271602,Zimbabwe,[65-69],Widowed,2017,2017,National statistics,False,True,1736,31,55,False,False,False,False,False,True,9
271603,Zimbabwe,[70-74],Widowed,2017,2017,National statistics,False,True,1736,31,60,False,False,False,False,False,True,9


### 4.2. Deleting unnecessary columns

In [51]:
# List of columns you want to drop
columns_to_drop = ['Country', 'AgeGroup','MaritalStatus','Data Source']

# Dropping columns from the DataFrame
df2 = encoded_df.drop(columns=columns_to_drop)

In [52]:
df2.head()

Unnamed: 0_level_0,Data Collection (Start Year),Data Collection (End Year),Sex_Man,Sex_Woman,Country_encoded,Status_encoded,Age_encoded,DataProcess_Census,DataProcess_Dual record,DataProcess_Estimate,DataProcess_GGS,DataProcess_Multiround surv,DataProcess_Survey,source_encoded
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1972,1974,True,False,499,4,9,False,False,False,False,False,True,9
2,1972,1974,True,False,499,4,17,False,False,False,False,False,True,9
3,1972,1974,True,False,499,4,22,False,False,False,False,False,True,9
4,1972,1974,True,False,499,4,26,False,False,False,False,False,True,9
5,1972,1974,True,False,499,4,29,False,False,False,False,False,True,9


### 4.3 Dealing with outliers

In [54]:
# dataframe=df to be applied on
#col_name=column name for which the values are to be calculated
#q1=first quartile(default = 0.1 or 10percentile)
#q3=third quartile(default = 0.99 or 99percentile)

def outlier_thresholds(dataframe, col_name, q1=0.1, q3=0.99):
    
    #calculating quartiles
    quartile1 = dataframe[col_name].quantile(q1) # the value below q1
    quartile3 = dataframe[col_name].quantile(q3) #the value below q3
    
    #calculating inter quartile range
    interquantile_range = quartile3 - quartile1
    
    #calculating limit for outliers
    up_limit = quartile3 + 1.5 * interquantile_range # datapoint above this value is an outlier
    low_limit = quartile1 - 1.5 * interquantile_range # datapoint below this valueis an outlier
    return low_limit, up_limit

#the q1 is also included in q3 since q3 counts all values below its specified limit
#the above function is used because the defalt 25% and 75% values are not suitable for this data

In [55]:
outlier_thresholds(df2, 'Country_encoded', q1=0.1, q3=0.99)

(-8337.5, 15538.5)

In [56]:
#this function checks for any outlier present in the given column
def check_outlier(dataframe, col_name, q1=0.1, q3=0.99):
    
    #calculating thresholds
    low_limit, up_limit = outlier_thresholds(dataframe, col_name, q1, q3)
    
    #dataframe[...]=selects each row if condition is true for the row
    #OR returns true if one or both conditions are true
    #any(axis=None)=This checks if any element in the resulting DataFrame is True.
    #If there is at least one True value, it returns True; otherwise, it returns False.
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

In [57]:
col = ['Country_encoded','Status_encoded','Age_encoded','source_encoded']
for i in col:
    #check for outliers in the given column
    print(i,":", check_outlier(df2,i))

Country_encoded : False
Status_encoded : False
Age_encoded : False
source_encoded : False


revert#correlation heatmap
#Initializes a new figure with an identifier of 5 and sets its size to 25x10 inches. 
#This large size ensures that all elements in the heatmap are clearly visible.
plt.figure(5, figsize=(25, 10))

#Calculating the Correlation Matrix
corr = df2.apply(lambda x: pd.factorize(x)[0]).corr()

#Creating a Mask for the Upper Triangle
#This mask is used to hide the upper triangle of the heatmap, as the correlation matrix is symmetric and the upper triangle is redundant.
#mask = np.triu(np.ones_like(corr, dtype=bool))

#Plotting the Heatmap
ax = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, linewidths=.2, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()

In [59]:
df2.to_csv('new_df.csv')

In [60]:
df2.head()

Unnamed: 0_level_0,Data Collection (Start Year),Data Collection (End Year),Sex_Man,Sex_Woman,Country_encoded,Status_encoded,Age_encoded,DataProcess_Census,DataProcess_Dual record,DataProcess_Estimate,DataProcess_GGS,DataProcess_Multiround surv,DataProcess_Survey,source_encoded
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1972,1974,True,False,499,4,9,False,False,False,False,False,True,9
2,1972,1974,True,False,499,4,17,False,False,False,False,False,True,9
3,1972,1974,True,False,499,4,22,False,False,False,False,False,True,9
4,1972,1974,True,False,499,4,26,False,False,False,False,False,True,9
5,1972,1974,True,False,499,4,29,False,False,False,False,False,True,9


In [61]:
num = ['Data Collection (Start Year)', 'Data Collection (End Year)', 'Country_encoded','Age_encoded', 'source_encoded' ]

In [62]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform only the specified columns
df2[num] = scaler.fit_transform(df2[num])


In [63]:
df2.head()

Unnamed: 0_level_0,Data Collection (Start Year),Data Collection (End Year),Sex_Man,Sex_Woman,Country_encoded,Status_encoded,Age_encoded,DataProcess_Census,DataProcess_Dual record,DataProcess_Estimate,DataProcess_GGS,DataProcess_Multiround surv,DataProcess_Survey,source_encoded
Sr.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.28,0.3,True,False,0.07,4,0.15,False,False,False,False,False,True,0.64
2,0.28,0.3,True,False,0.07,4,0.27,False,False,False,False,False,True,0.64
3,0.28,0.3,True,False,0.07,4,0.35,False,False,False,False,False,True,0.64
4,0.28,0.3,True,False,0.07,4,0.42,False,False,False,False,False,True,0.64
5,0.28,0.3,True,False,0.07,4,0.47,False,False,False,False,False,True,0.64


In [64]:
# Assuming 'df' is your DataFrame and you want to scale specific columns
scaler = StandardScaler()

# Fit and transform the desired columns (e.g., numerical columns)
scaled_features = scaler.fit_transform(df2[['Data Collection (Start Year)',	'Data Collection (End Year)', 'Country_encoded', 'Age_encoded','source_encoded']])

# If you want to replace the original columns with the scaled ones:
df2[['Data Collection (Start Year)',	'Data Collection (End Year)', 'Country_encoded', 'Age_encoded','source_encoded']] = scaled_features

# To inverse transform back to the original scale
original_values = scaler.inverse_transform(scaled_features)

In [65]:
X=df2.drop(["Status_encoded"], axis=1)

In [66]:
y = df2['Status_encoded']

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models

# Initialize LazyClassifier (uses all available classification models by default)
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit the classifier and evaluate
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Show the results
print(models)

# Define specific classification models
custom_models = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'DecisionTree': DecisionTreeClassifier(),
    'LightGBM': LGBMClassifier()
}

# Initialize LazyClassifier with custom models
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None, models=custom_models)

# Fit the classifier and evaluate
custom_models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Show the results
print(custom_models)

# Initialize the models
models = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=200),
    'SVC': SVC(),
    'DecisionTree': DecisionTreeClassifier(),
    'LightGBM': LGBMClassifier()
}

# Dictionary to store model results
model_results = {}



# RandomForest
start_time = time.time()
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
time_taken = time.time() - start_time
random_forest_model = {'accuracy': accuracy, 'time_taken': time_taken}
print(f"Model: RandomForest")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Time Taken: {time_taken:.4f} seconds\n")

# LogisticRegression
start_time = time.time()
models['LogisticRegression'].fit(X_train, y_train)
y_pred = models['LogisticRegression'].predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
time_taken = time.time() - start_time
model_results['LogisticRegression'] = {'accuracy': accuracy, 'time_taken': time_taken}
print(f"Model: LogisticRegression")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Time Taken: {time_taken:.4f} seconds\n")



# SVC
start_time = time.time()
svc_model = SVC()
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
time_taken = time.time() - start_time
svc_model_result= {'accuracy': accuracy, 'time_taken': time_taken}
print(f"Model: SVC")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Time Taken: {time_taken:.4f} seconds\n")

# DecisionTree
start_time = time.time()
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
y_pred = decision_tree_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
time_taken = time.time() - start_time
decision_tree_model_result = {'accuracy': accuracy, 'time_taken': time_taken}
print(f"Model: DecisionTree")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Time Taken: {time_taken:.4f} seconds\n")

# LightGBM
start_time = time.time()
lightgbm_model = LGBMClassifier()
lightgbm_model.fit(X_train, y_train)
y_pred = lightgbm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
time_taken = time.time() - start_time
lightgbm_model_result = {'accuracy': accuracy, 'time_taken': time_taken}
print(f"Model: LightGBM")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Time Taken: {time_taken:.4f} seconds\n")


# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
start_time = time.time()
logistic_regression_model = LogisticRegression(max_iter=500)
logistic_regression_model.fit(X_train_scaled, y_train)  # Train on scaled data
y_pred = logistic_regression_model.predict(X_test_scaled)  # Predict on scaled test data
accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
time_taken = time.time() - start_time  # Measure time taken

# Store results
logistic_regression_results = {'accuracy': accuracy, 'time_taken': time_taken}

# Print the results
print(f"Model: LogisticRegression")
print(f"  Accuracy: {logistic_regression_results['accuracy']:.4f}")
print(f"  Time Taken: {logistic_regression_results['time_taken']:.4f} seconds\n")

In [78]:
start_time = time.time()
random_forest_model = RandomForestClassifier()



In [79]:
# Initialize KFold with 5 splits (k=5)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [80]:
# Perform cross-validation using accuracy as the scoring metric
cv_scores = cross_val_score(random_forest_model, X, y, cv=kf, scoring='accuracy')


In [81]:
# Output the individual fold scores and the average score
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.2f}")

Cross-Validation Scores: [0.00294545 0.00279818 0.00296386 0.00301909 0.00281664]
Mean Accuracy: 0.00


In [82]:
random_forest_model.fit(X_train, y_train)


In [83]:
y_pred = random_forest_model.predict(X_test)
y_pred_proba = random_forest_model.predict_proba(X_test)[:, 1]
model_score = random_forest_model.score(X_test, y_test)
print(f'Accuracy: {model_score:.2f}')

Accuracy: 0.00


In [84]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))


Accuracy: 0.0029822720494836253
Precision: 0.00855712288688383


In [85]:
time_taken = time.time() - start_time  # Measure time taken
print(f"  Time Taken: {time_taken} seconds\n")

  Time Taken: 295.8394317626953 seconds



start_time = time.time()
lightgbm_model = LGBMClassifier(force_col_wise=True)
lightgbm_model.fit(X_train, y_train)


y_pred = lightgbm_model.predict(X_test)
y_pred_proba = lightgbm_model.predict_proba(X_test)[:, 1]


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))


time_taken = time.time() - start_time  # Measure time taken
print(f"  Time Taken: {time_taken} seconds\n")

y_pred = svc_model.predict(X_test)
y_pred_proba = svc_model.predict_proba(X_test)[:, 1]


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))


time_taken = time.time() - start_time  # Measure time taken
print(f"  Time Taken: {time_taken} seconds\n")

In [93]:
start_time = time.time()
decision_tree_model = DecisionTreeClassifier()
# Initialize KFold with 5 splits (k=5)
kf = KFold(n_splits=5, shuffle=True, random_state=42)




In [94]:
# Perform cross-validation using accuracy as the scoring metric
cv_scores = cross_val_score(decision_tree_model, X, y, cv=kf, scoring='accuracy')


In [95]:
# Output the individual fold scores and the average score
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.2f}")


Cross-Validation Scores: [0.00287182 0.00276136 0.00289023 0.00270614 0.00259573]
Mean Accuracy: 0.00


In [96]:
decision_tree_model.fit(X_train, y_train)

In [97]:
y_pred = decision_tree_model.predict(X_test)
y_pred_proba = decision_tree_model.predict_proba(X_test)[:, 1]
model_score = decision_tree_model.score(X_test, y_test)
print(f'Accuracy: {model_score:.2f}')

Accuracy: 0.00


In [98]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))


Accuracy: 0.00287181752913238
Precision: 0.010569013411960426


In [99]:
time_taken = time.time() - start_time  # Measure time taken
print(f"  Time Taken: {time_taken} seconds\n")

  Time Taken: 7.184298276901245 seconds



In [100]:
start_time = time.time()
model = LogisticRegression(random_state=0, max_iter=1000)




In [101]:
# Initialize KFold with 5 splits (k=5)
kf = KFold(n_splits=5, shuffle=True, random_state=42)



In [102]:
# Perform cross-validation using accuracy as the scoring metric
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')



In [103]:
# Output the individual fold scores and the average score
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.2f}")

Cross-Validation Scores: [0.21201745 0.21216472 0.21531268 0.2096795  0.21290501]
Mean Accuracy: 0.21


In [104]:
model.fit(X_train, y_train)

In [105]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
model_score = model.score(X_test, y_test)
print(f'Accuracy: {model_score:.2f}')

Accuracy: 0.21


In [106]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))


Accuracy: 0.21170449733988697
Precision: 0.04893518767950544


In [107]:
time_taken = time.time() - start_time  # Measure time taken
print(f"  Time Taken: {time_taken} seconds\n")

  Time Taken: 540.4283728599548 seconds



In [108]:
# Initialize the XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')



In [109]:
# Initialize KFold with 5 splits (k=5)
kf = KFold(n_splits=5, shuffle=True, random_state=42)



In [110]:
# Perform cross-validation using accuracy as the scoring metric
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')



In [111]:
# Output the individual fold scores and the average score
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.2f}")

Cross-Validation Scores: [0.1341102  0.1401852  0.13862042 0.13913588 0.14051915]
Mean Accuracy: 0.14


In [112]:
# Fit the model
model.fit(X_train, y_train)

In [113]:
# Predict on the test set
y_pred = model.predict(X_test)
model_score = model.score(X_test, y_test)
print(f'Accuracy: {model_score:.2f}')

Accuracy: 0.13


In [114]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.13


In [115]:
time_taken = time.time() - start_time  # Measure time taken
print(f"  Time Taken: {time_taken} seconds\n")

  Time Taken: 1158.0873036384583 seconds

