In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

In [None]:
#Simple Linear Regression model for Population prediction
#First read the dataset as a Pandas Dataframe
indiadf = pd.read_csv("Data/population.csv", encoding = "latin-1")

#Set the index as the column 'State/UT'
indiadf = indiadf.set_index('State/UT' )

#Take the transpose so that each row is a year and each column is a state
indiadfT = indiadf.T

#Remember the state columns before any helper column is appended, so the loop
#below covers every state without relying on a hard-coded column count
state_cols = list(indiadfT.columns)

#Create a new column 'Year' which holds the year labels (the transposed index)
indiadfT['Year'] = indiadfT.index

#Select all the columns from the dataframe (states followed by 'Year')
col = list(indiadfT.columns)

#Numeric feature frame used for fitting. The year labels come from the CSV header,
#so they are converted to floats explicitly instead of relying on implicit coercion.
years = pd.DataFrame({'Year': np.asarray(indiadfT.index, dtype=float)}, index=indiadfT.index)

#Years to forecast, passed as a DataFrame with the same feature name used in fit()
#(You can predict any year you wish simply by replacing these values)
future = pd.DataFrame({'Year': [2020.0, 2050.0]})

#Set the variable 'model' as an object of LinearRegression from sklearn
model = LinearRegression()
for i, state in enumerate(state_cols):

    #Fit the model with the years as the feature and the i-th state column as the target
    model.fit(years, indiadfT.iloc[:, i])

    #Create a new column to store the fitted values for data plotting at a later stage
    indiadfT['Predicted_{}'.format(state)] = model.predict(years)

    #Predict the 2020 and 2050 values in one call and unpack them as plain floats
    raw_2020, raw_2050 = (float(p) for p in model.predict(future))
    pred_2020 = int(raw_2020)
    pred_2050 = int(raw_2050)

    #Growth from 2020 to 2050 as a percentage, computed from the untruncated predictions
    growth = (raw_2050 - raw_2020) / raw_2020 * 100

    #Print the output statewise
    print("The expected population of {} in 2050 is {} with a change of {:.3f}%.".format(state, pred_2050,growth))

#Comments remain the same for all the models below since it's the same pattern but different datasets.

In [None]:
#Line plot of three regions' population together with their fitted regression lines
year_axis = indiadfT['Year']

#Observed series, selected by column position and drawn with cross markers
for position, fmt in ((5, 'b-x'), (7, 'r-x'), (10, 'g-x')):
    plt.plot(year_axis, indiadfT.iloc[0:, position], fmt)

#Fitted values written by the regression cell, all drawn as plain yellow lines
for fitted in ('Predicted_Chandigarh', 'Predicted_Delhi', 'Predicted_Haryana'):
    plt.plot(year_axis, indiadfT[fitted], 'y-')

plt.grid(True)
plt.xlabel('Year')
plt.ylabel('Population Estimates in 2050')

#Four labels for six lines: the three observed series plus one shared entry for the fits
legend_labels = [indiadfT.columns[k] for k in (5, 7, 10)] + ['Best Fit Line']
plt.legend(legend_labels)
plt.show()

In [None]:
#Simple Linear Regression model for Sex Ratio prediction
indiadf = pd.read_csv("Data/sex_ratio.csv", encoding = "latin-1")
indiadf = indiadf.set_index('State/UT' )
indiadfT = indiadf.T

#State columns captured before 'Year' is appended, so no column count is hard-coded
state_cols = list(indiadfT.columns)
indiadfT['Year'] = indiadfT.index
col = list(indiadfT.columns)

#Year labels come from the CSV header; convert them to floats explicitly for fitting
years = pd.DataFrame({'Year': np.asarray(indiadfT.index, dtype=float)}, index=indiadfT.index)

#Forecast years, passed with the same feature name that fit() sees
future = pd.DataFrame({'Year': [2020.0, 2050.0]})

model = LinearRegression()
for i, state in enumerate(state_cols):
    model.fit(years, indiadfT.iloc[:, i])

    #Fitted values are kept as a column for later plotting
    indiadfT['Predicted_{}'.format(state)] = model.predict(years)

    #Both forecasts in one call, unpacked as plain floats
    raw_2020, raw_2050 = (float(p) for p in model.predict(future))
    pred_2020 = int(raw_2020)
    pred_2050 = int(raw_2050)

    #Percentage change, computed from the untruncated predictions
    growth = (raw_2050 - raw_2020) / raw_2020 * 100
    print("The expected sex-ratio of {} in 2050 is {} with a change of {:.3f}%.".format(state, pred_2050,growth))

In [None]:
#Bar plot for sex ratios in 2001 and 2011 for Andaman-Lakshadweep
#y is given as a list: a set has no defined order and recent pandas rejects set indexers
indiadf.iloc[:17, :].plot.bar(y = ['2001', '2011'], rot = 90)

In [None]:
#Bar plot for sex ratios in 2001 and 2011 for remaining states
#y is given as a list: a set has no defined order and recent pandas rejects set indexers
indiadf.iloc[17:, :].plot.bar(y = ['2001', '2011'], rot = 90)

In [None]:
#Simple Linear Regression model for Life Expectancy prediction
indiadf = pd.read_csv("Data/expectancy.csv", encoding = "latin-1")
indiadf = indiadf.set_index('State/UT' )
indiadfT = indiadf.T

#State columns captured before 'Year' is appended, so no column count is hard-coded
state_cols = list(indiadfT.columns)
indiadfT['Year'] = indiadfT.index
col = list(indiadfT.columns)

#Year labels come from the CSV header; convert them to floats explicitly for fitting
years = pd.DataFrame({'Year': np.asarray(indiadfT.index, dtype=float)}, index=indiadfT.index)

#Forecast years, passed with the same feature name that fit() sees
future = pd.DataFrame({'Year': [2020.0, 2050.0]})

model = LinearRegression()
for i, state in enumerate(state_cols):
    model.fit(years, indiadfT.iloc[:, i])

    #Fitted values are kept as a column for later plotting
    indiadfT['Predicted_{}'.format(state)] = model.predict(years)

    #Both forecasts in one call, unpacked as plain floats
    raw_2020, raw_2050 = (float(p) for p in model.predict(future))
    pred_2020 = int(raw_2020)
    pred_2050 = int(raw_2050)

    #Percentage change from the untruncated predictions; truncating to whole numbers
    #first would distort the ratio
    growth = (raw_2050 - raw_2020) / raw_2020 * 100
    print("The life expectancy of {} in 2050 is {} with a change of {:.3f}%.".format(state, pred_2050,growth))

In [None]:
#Line plot of the column at position 21 against its fitted regression line
year_axis = indiadfT['Year']
observed = indiadfT.iloc[0:, 21]
fitted = indiadfT['Predicted_ALL INDIA']

plt.plot(year_axis, observed, 'b-x')
plt.plot(year_axis, fitted, 'y-')

plt.grid(False)
plt.xlabel('Year')
plt.ylabel('Life Expectancy Estimates in 2050')

#One legend entry per line: the observed column's name, then the fit
legend_labels = [indiadfT.columns[21], 'Best Fit Line']
plt.legend(legend_labels)
plt.show()

In [None]:
#Simple Linear Regression model for Population Distribution prediction
indiadf = pd.read_csv("Data/pop_dist.csv", encoding = "latin-1")
indiadf = indiadf.set_index('State/UT' )
indiadfT = indiadf.T

#State columns captured before 'Year' is appended, so no column count is hard-coded
state_cols = list(indiadfT.columns)
indiadfT['Year'] = indiadfT.index
col = list(indiadfT.columns)

#Year labels come from the CSV header; convert them to floats explicitly for fitting
years = pd.DataFrame({'Year': np.asarray(indiadfT.index, dtype=float)}, index=indiadfT.index)

#Forecast points, passed with the same feature name that fit() sees
future = pd.DataFrame({'Year': [2020.4, 2050.4]})

model = LinearRegression()
for i, state in enumerate(state_cols):
    model.fit(years, indiadfT.iloc[:, i])

    #Fitted values are kept as a column for later plotting
    indiadfT['Predicted_{}'.format(state)] = model.predict(years)

    #Both forecasts in one call, unpacked as plain floats
    raw_2020, raw_2050 = (float(p) for p in model.predict(future))
    pred_2020 = int(raw_2020)
    pred_2050 = int(raw_2050)

    #Percentage change from the untruncated predictions; dividing by a truncated
    #integer would be imprecise and fails outright if it truncates to zero
    growth = (raw_2050 - raw_2020) / raw_2020 * 100
    print("The estimated age distribution in the bracket 20-24 of {} in 2050 is {} with a change of {:.3f}%.".format(state, pred_2050,growth))

In [None]:
#Horizontal bar plot for 2001,2006 and 2011 in the age group of 20-24 for half the states
#y is given as a list: a set has no defined order and recent pandas rejects set indexers
#NOTE(review): the slice starts at row 1, so row 0 appears in neither bar plot - confirm this is intended
indiadf.iloc[1:12,:].plot.barh(y = ['2001.4', '2006.4', '2011.4'], rot = 0)

In [None]:
#Horizontal bar plot for 2001,2006 and 2011 in the age group of 20-24 for other half of the states
#y is given as a list: a set has no defined order and recent pandas rejects set indexers
indiadf.iloc[12:,:].plot.barh(y = ['2001.4', '2006.4', '2011.4'], rot = 0)