In [1]:
import pandas as pd
import numpy as np

In [2]:
# Assuming your dataset is named 'df'
df = pd.read_csv("Suicides_in_India.csv")
df = df.drop(columns = 'index')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237519 entries, 0 to 237518
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   State      237519 non-null  object
 1   Year       237519 non-null  int64 
 2   Type_code  237519 non-null  object
 3   Type       237519 non-null  object
 4   Gender     237519 non-null  object
 5   Age_group  237519 non-null  object
 6   Total      237519 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 12.7+ MB


In [3]:
sampled_df = df.sample(n=5000, random_state=42)

# If you want a systematic sample
# Calculate the step size to achieve approximately 2000 entries
step = len(df) // 2000
systematic_sampled_df = df.iloc[::step]

# Filter the sampled DataFrame to include only rows where 'Total' is 0 or 1
sampled_df = sampled_df[(sampled_df['Total'] == 0) | (sampled_df['Total'] == 1)]

# Display the sampled DataFrame
print(sampled_df)
print(systematic_sampled_df)

                    State  Year             Type_code  \
1398        A & N ISLANDS  2002  Professional_Profile   
115170        LAKSHADWEEP  2004  Professional_Profile   
66738                 GOA  2002  Professional_Profile   
201159             SIKKIM  2006         Means_adopted   
168186           NAGALAND  2004                Causes   
...                   ...   ...                   ...   
176647             SIKKIM  2010  Professional_Profile   
142862            MANIPUR  2004         Means_adopted   
105764          KARNATAKA  2011         Means_adopted   
143370            MANIPUR  2011                Causes   
17858   ARUNACHAL PRADESH  2009      Education_Status   

                                           Type  Gender Age_group  Total  
1398                             Retired Person    Male     45-59      0  
115170        Self-employed (Business activity)    Male     30-44      0  
66738                            Retired Person    Male      0-14      0  
201159         

In [4]:
#conversion of categorical data to numerical data 
sampled_df['Gender'] = sampled_df['Gender'].astype('category')
sampled_df['Gender'] = sampled_df['Gender'].cat.codes
sampled_df['Type'] = sampled_df['Type'].astype('category')
sampled_df['Type'] = sampled_df['Type'].cat.codes
sampled_df['Type'] = sampled_df['Type'].astype('category')
sampled_df['Type'] = sampled_df['Type'].cat.codes
sampled_df

Unnamed: 0,State,Year,Type_code,Type,Gender,Age_group,Total
1398,A & N ISLANDS,2002,Professional_Profile,58,1,45-59,0
115170,LAKSHADWEEP,2004,Professional_Profile,59,1,30-44,0
66738,GOA,2002,Professional_Profile,58,1,0-14,0
201159,SIKKIM,2006,Means_adopted,13,0,45-59,0
168186,NAGALAND,2004,Causes,21,0,45-59,0
...,...,...,...,...,...,...,...
176647,SIKKIM,2010,Professional_Profile,58,1,0-14,0
142862,MANIPUR,2004,Means_adopted,16,0,0-14,0
105764,KARNATAKA,2011,Means_adopted,9,1,60+,1
143370,MANIPUR,2011,Causes,0,1,45-59,0


In [5]:
#check if the sampled df has any null values
sampled_df.isnull().sum()

State        0
Year         0
Type_code    0
Type         0
Gender       0
Age_group    0
Total        0
dtype: int64

In [6]:
sampled_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3220 entries, 1398 to 17858
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   State      3220 non-null   object
 1   Year       3220 non-null   int64 
 2   Type_code  3220 non-null   object
 3   Type       3220 non-null   int8  
 4   Gender     3220 non-null   int8  
 5   Age_group  3220 non-null   object
 6   Total      3220 non-null   int64 
dtypes: int64(2), int8(2), object(3)
memory usage: 157.2+ KB


In [7]:
X = sampled_df.drop(columns = ['State','Year','Age_group','Type_code','Total'])
X

Unnamed: 0,Type,Gender
1398,58,1
115170,59,1
66738,58,1
201159,13,0
168186,21,0
...,...,...
176647,58,1
142862,16,0
105764,9,1
143370,0,1


In [8]:
y = sampled_df['Total']
y

1398      0
115170    0
66738     0
201159    0
168186    0
         ..
176647    0
142862    0
105764    1
143370    0
17858     0
Name: Total, Length: 3220, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train, y_test = train_test_split(X, y ,test_size = 0.3, random_state = 21)
#30% data for testing, 70% for training that's why test_size = 0.3

In [10]:
X_train

Unnamed: 0,Type,Gender
139679,57,1
68088,11,0
184955,16,1
57177,3,1
113820,33,1
...,...,...
137238,60,1
236899,11,0
17299,17,0
78595,15,1


In [11]:
X_test

Unnamed: 0,Type,Gender
652,10,1
56069,0,0
120707,58,1
102814,24,1
156773,46,0
...,...,...
225079,8,0
105943,19,1
94959,15,1
153796,19,0


In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
scaler = StandardScaler()

In [14]:
X_train_scaled = scaler.fit_transform(X_train)

In [15]:
#this is done so the train data doesn't know about the SD and mean values(z = (x - u) / s)
X_test_scaled = scaler.transform(X_test)

In [16]:
X_train_scaled

array([[ 1.22176374,  1.05943879],
       [-1.11895489, -0.94389597],
       [-0.86452895,  1.05943879],
       ...,
       [-0.81364376, -0.94389597],
       [-0.91541414,  1.05943879],
       [-1.42426601,  1.05943879]])

In [17]:
X_test_scaled

array([[-1.16984008,  1.05943879],
       [-1.67869195, -0.94389597],
       [ 1.27264892,  1.05943879],
       ...,
       [-0.91541414,  1.05943879],
       [-0.71187339, -0.94389597],
       [ 0.86556742, -0.94389597]])

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
log_reg = LogisticRegression(random_state = 0).fit(X_train_scaled,y_train)

In [20]:
log_reg.predict(X_train_scaled)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
#percentage rate of predicting the train values correctly 
log_reg.score(X_train_scaled,y_train)

0.8957409050576752

In [22]:
#percentage rate of predicting the test values correctly 
log_reg.score(X_test_scaled,y_test)

0.8788819875776398

In [23]:
#done to check if the model can be improved or not
log_reg1 = LogisticRegression(random_state = 0,
                             C = 1,
                             fit_intercept = True,
                             ).fit(X_train_scaled,y_train)

In [24]:
log_reg1.score(X_train_scaled,y_train)

0.8957409050576752

In [25]:
log_reg1.score(X_train_scaled,y_train)

0.8957409050576752