In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and echo every file path found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing the dataset

In [None]:
# Read the insurance dataset from the Kaggle input directory and preview it.
csv_path = "../input/insurance/insurance.csv"
data = pd.read_csv(csv_path)
data.head()

Understanding the dataset

In [None]:
# Preview the last rows of the dataset.
data.tail()

In [None]:
# Dimensions of the dataset as (rows, columns).
data.shape

In [None]:
# Number of observations per sex category.
print("no of males and females")
sex_counts = data['sex'].value_counts()
sex_counts

In [None]:
# Number of observations per region.
print("no of observations for each region")
region_counts = data['region'].value_counts()
region_counts

In [None]:
# include='all' summarises every column, not only the numeric ones.
data.describe(include='all')

## gives the descriptive statistics of all the continuous and categorical variables.

**From the above descriptive statistics we can observe:**

* All the variables have the same number of observations, which implies there are no null values in any variable.
* There are more males than females.
* The southeast region has the most observations.
* Non-smokers outnumber smokers.
* For charges, the mean and the median (50%) differ considerably, with mean > median, so there are outliers in the right tail of the distribution; our target variable charges must therefore be right-skewed.



In [None]:
import seaborn as sns
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement and draws the same histogram plus density curve.
sns.histplot(data['charges'], bins=10, kde=True, stat='density', kde_kws=dict(cut=3))
print("looking for the skewness in the curve")

* Outlier management

In [None]:
# Trim extreme outliers: keep only rows whose charges fall below the 99th percentile.
q = data['charges'].quantile(0.99)
# .copy() makes data1 an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning or depend on pandas view/copy semantics.
data1 = data[data['charges'] < q].copy()
data1.describe()

In [None]:
# Count the missing values in each column of the trimmed frame.
print("checking for the null values")
null_counts = data1.isna().sum()
print(null_counts)

**Pre-Assumptions for implementing the linear regression**
* Normality of target variable.
* Linear relationship between continuous input variables and target.
* No Multicollinearity. 

**1. Normality of the target variable**

In [None]:
print("transforming the target variable to achieve normal distribution")
# assign() builds a new frame instead of writing a column into a filtered slice
# of `data`, which is what triggers SettingWithCopyWarning.
data1 = data1.assign(target=np.log(data1['charges']))
# sns.distplot is deprecated; this is the documented histplot equivalent.
sns.histplot(data1['target'], kde=True, stat='density', kde_kws=dict(cut=3))

We can see that after the log transformation the distribution of charges (now stored in `target`) is much closer to a normal distribution.

In [None]:
# Remove the raw charges column; the log-transformed `target` is used instead.
data2 = data1.drop(columns=['charges'])
data2.head()

### EDA and data visualisation.

In [None]:
# Number of observations per sex, split by smoker status.
sns.countplot(x='sex',hue='smoker',data=data2)

* We intend to look at the smokers, we could observe the proportion of male smokers is greater than female smokers.

In [None]:
# Age distribution for each smoker category.
sns.boxplot(x='smoker',y='age',data=data2)

* The median ages are almost the same (around 40) for smokers and non-smokers, so age and smoking status appear to be unrelated.

In [None]:
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement and draws the same histogram plus density curve.
sns.histplot(data2['age'], bins=10, kde=True, stat='density', kde_kws=dict(cut=3))

* The age variable is not that greatly normally distributed, but it's okay to consider it.

In [None]:
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement and draws the same histogram plus density curve.
sns.histplot(data2['bmi'], bins=10, kde=True, stat='density', kde_kws=dict(cut=3))

* The bmi is approximately normally distributed.
* Let's check its relationship with our target variable `target` (the log of charges).

**2.Linear relationship of input variables with target variable**

In [None]:
# Scatter of bmi against the log-transformed target, with a fitted regression line.
sns.regplot(x='bmi',y='target',data=data2)

In [None]:
# Scatter of age against the log-transformed target, with a fitted regression line.
sns.regplot(x='age',y='target',data=data2)

* We can see a linear relationship between age and the target, and between bmi and the target.

* lets check the relationship between the target and remaining categorical variables 

In [None]:
# Distribution of the target for each sex.
sns.boxplot(x='sex',y='target',data=data2)

We can see that the medians for male and female are about the same, so sex appears to have no impact on our target.

In [None]:

# Distribution of the target for smokers and non-smokers.
sns.boxplot(x='smoker',y='target',data=data2)

Here smoker status influences our target, as the medians of the two groups differ.

In [None]:
# Distribution of the target for each number of children.
sns.boxplot(x='children',y='target',data=data2)

The number of children also has an impact on our target variable, as the medians differ, so it should be included in our regression.

In [None]:
# Distribution of the target for each region.
sns.boxplot(x='region',y='target',data=data2)

The medians of the various regions are almost the same, so region does not have a great impact on our target variable, but it may have some effect.

**3. Checking for multicollinearity**

In [None]:
# List the columns remaining in data2.
data2.columns

In [None]:
# Restrict the correlation matrix to numeric columns. data2 still contains the
# non-numeric categorical columns (they are one-hot encoded only afterwards),
# and pandas >= 2.0 raises on them instead of silently dropping them.
data2.corr(numeric_only=True)

As no pair of variables has a correlation above 0.6 or below -0.6, we can keep these variables; they do not show multicollinearity.

* One hot encoding(creating dummies)

In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first level
# of each category so the dummy columns are not perfectly collinear.
data_preprocessed=pd.get_dummies(data2,drop_first=True)
data_preprocessed.head()

**The variables found to be related to our target are:**
* age
* region
* bmi
* smoker
* children

## Linear Regression Model

In [None]:
# Separate the feature matrix (everything except `target`) from the response.
X = data_preprocessed.drop(columns=['target'])
Y = data_preprocessed['target']

> Scaling the data

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column; fit_transform fits the scaler on X and
# returns the transformed array in a single call.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Train-test split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw features first, then fit the scaler on the training rows only.
# Scaling the full data set before splitting lets the test rows' mean and
# variance leak into the training features.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=364
)
train_scaler = StandardScaler().fit(x_train_raw)
x_train = train_scaler.transform(x_train_raw)
x_test = train_scaler.transform(x_test_raw)

# Regression with statsmodels

In [None]:
import statsmodels.api as sm
# Wrap the scaled training array in a DataFrame so the summary labels each
# coefficient with its feature name instead of x1..xN. The index is taken from
# y_train because statsmodels requires endog and exog indices to match.
train_features = pd.DataFrame(np.asarray(x_train), columns=X.columns, index=y_train.index)
inputs = sm.add_constant(train_features)
results = sm.OLS(y_train, inputs).fit()
results.summary()

The model seems good, as the R-squared and adjusted R-squared values are very close to each other, which is desirable. Interpret the above results and change the variables accordingly.

# Sklearn Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the training split; fit() returns the estimator,
# and the trailing expression shows it as the cell output.
reg = LinearRegression().fit(x_train, y_train)
reg

In [None]:
# In-sample predictions: fitted values on the training data, not a held-out evaluation.
y_hat=reg.predict(x_train)

In [None]:
import matplotlib.pyplot as plt
# Fitted versus actual target values on the training set, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(y_train, y_hat)
ax.set_xlabel(" y_train", size=18)
ax.set_ylabel("predicted y_train", size=18)
plt.show()