In [158]:
import numpy as np
import pandas as pd

In [159]:
birth_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/dataSets/births.csv")
birth_data.columns

Index(['year', 'month', 'day', 'gender', 'births'], dtype='object')

In [160]:
birth_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15547 entries, 0 to 15546
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    15547 non-null  int64  
 1   month   15547 non-null  int64  
 2   day     15067 non-null  float64
 3   gender  15547 non-null  object 
 4   births  15547 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 607.4+ KB


In [161]:
temp = birth_data["year"].astype('string')
temp

0        1969
1        1969
2        1969
3        1969
4        1969
         ... 
15542    2008
15543    2008
15544    2008
15545    2008
15546    2008
Name: year, Length: 15547, dtype: string

Q1: Adding Decades column

In [163]:
birth_data["Decades"] = temp.apply(lambda x : x[:-1]).apply(lambda x : x + '0').astype("int64")
birth_data.tail()

Unnamed: 0,year,month,day,gender,births,Decades
15542,2008,10,,M,183219,2000
15543,2008,11,,F,158939,2000
15544,2008,11,,M,165468,2000
15545,2008,12,,F,173215,2000
15546,2008,12,,M,181235,2000


Q2:  Descriptive statistics of the data

In [164]:
birth_data.describe()

Unnamed: 0,year,month,day,births,Decades
count,15547.0,15547.0,15067.0,15547.0,15547.0
mean,1979.037435,6.515919,17.769894,9762.293561,1974.544285
std,6.72834,3.449632,15.284034,28552.46581,6.789583
min,1969.0,1.0,1.0,1.0,1960.0
25%,1974.0,4.0,8.0,4358.0,1970.0
50%,1979.0,7.0,16.0,4814.0,1970.0
75%,1984.0,10.0,24.0,5289.5,1980.0
max,2008.0,12.0,99.0,199622.0,2000.0


Q3: Check if your data contains any missing values

In [165]:
def count_nan(df):
    """"Returns a dict of columns name as keys and count of missing values in the column"""
    bool_df = df.isnull()
    missing_val = [bool_df[column].astype("int64").sum() for column in bool_df.columns]
    cols = df.columns
    return dict(zip(cols, missing_val))

In [166]:
count_nan(birth_data)

{'year': 0, 'month': 0, 'day': 480, 'gender': 0, 'births': 0, 'Decades': 0}

Q4: What is the trend of male & female births every decade?


In [167]:
trend = birth_data.groupby(["Decades", "gender"]).sum().reset_index()
trend_male = trend[trend["gender"]=='M']
trend_female = trend[trend["gender"]=='F']

In [168]:
import plotly.express as px
fig = px.bar(trend_male, x='Decades', y='births', title='Male Births')
fig.show()

In [169]:
import plotly.express as px
fig = px.bar(trend_female, x='Decades', y='births', title='Female Births')
fig.show()

Q5: To remove outliers from dataset following techinque to include only those values that fall within 5
standard deviations from the mean.This is a common statistical technique used to focus on the central
tendency of the data while excluding extreme values.
Use this technique to remove outliers.


In [170]:
outlier_removed_df = pd.DataFrame({})
for column in birth_data.columns:
    if birth_data[column].dtype != "object":
        mean = birth_data[column].mean()
        std = birth_data[column].std()

        lower_limit = mean - 5 * std
        upper_limit = mean + 5 * std
        outlier_removed_df[column] = birth_data[(birth_data[column] >= lower_limit) & (birth_data[column] <= upper_limit)][column]
    else:
        outlier_removed_df[column] = birth_data[column]

outlier_removed_df.dropna(inplace=True)
outlier_removed_df

Unnamed: 0,year,month,day,gender,births,Decades
0,1969,1,1.0,F,4046.0,1960
1,1969,1,1.0,M,4440.0,1960
2,1969,1,2.0,F,4454.0,1960
3,1969,1,2.0,M,4548.0,1960
4,1969,1,3.0,F,4548.0,1960
...,...,...,...,...,...,...
15062,1988,12,29.0,M,5944.0,1980
15063,1988,12,30.0,F,5742.0,1980
15064,1988,12,30.0,M,6095.0,1980
15065,1988,12,31.0,F,4435.0,1980


In [171]:
count_nan(outlier_removed_df)

{'year': 0, 'month': 0, 'day': 0, 'gender': 0, 'births': 0, 'Decades': 0}

Q6: Plot births by weekday for several decades. Write down your observation.


In [172]:
x = outlier_removed_df.groupby(["day","Decades"]).sum(numeric_only=True).reset_index()

In [173]:
import plotly.express as px
fig = px.bar(x, x='day', y='births')
fig.show()

Looking At the distribution it looks like 31st day has significantly less birth rate. it is due to 31st occuring alternately in a year.

Q7: Group the data by month and day separately

In [177]:
grouped_data = birth_data.groupby(["month","day"])["births"].mean().reset_index()
grouped_data.head()

Unnamed: 0,month,day,births
0,1,1.0,4009.225
1,1,2.0,4247.4
2,1,3.0,4500.9
3,1,4.0,4571.35
4,1,5.0,4603.625


Q8: Focusing on the month and day only, you have a time series reflecting the average number of
births by date of the year. From this, plot the data.

In [178]:
grouped_data = grouped_data[(grouped_data["day"]>=1) & (grouped_data["day"]<=31)]
grouped_data['Date'] = pd.to_datetime(grouped_data[['month', 'day']].assign(year=2012), errors='coerce')
grouped_data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,month,day,births,Date
0,1,1.0,4009.225,2012-01-01
1,1,2.0,4247.400,2012-01-02
2,1,3.0,4500.900,2012-01-03
3,1,4.0,4571.350,2012-01-04
4,1,5.0,4603.625,2012-01-05
...,...,...,...,...
378,12,27.0,4850.150,2012-12-27
379,12,28.0,5044.200,2012-12-28
380,12,29.0,5120.150,2012-12-29
381,12,30.0,5172.350,2012-12-30


In [179]:
import plotly.express as px
fig = px.line(grouped_data, x="Date", y="births")
fig.show()