In [35]:
import pandas as pd # For reading data from sources such as Excel and CSV files
import matplotlib.pyplot as plt # For plotting the data
import numpy as np # Provides support for large, multi-dimensional arrays and matrices, along with mathematical functions
%matplotlib inline
# A magic command for Jupyter Notebooks. It ensures that all the plots generated using matplotlib are displayed inline (within the notebook).


In [None]:
# Read the CSV (comma-separated values) file into a pandas DataFrame.
# A DataFrame is a two-dimensional, tabular data structure, similar to a
# database table or a spreadsheet; `df` holds the loaded data.
# (These notes are comments rather than a bare triple-quoted string: a string
# left as the last expression of a cell is echoed back as the cell's output.)
df = pd.read_csv('height-weight.csv')

In [None]:
# Preview the first five rows of the DataFrame (5 is head()'s default).
df.head(5)

In [None]:
## Scatter plot
# Weight goes on the x-axis and Height on the y-axis; every point is one row
# of df, so the plot shows how the two columns relate.
plt.scatter(x=df['Weight'], y=df['Height'])

plt.xlabel("Weight")  # x-axis label
plt.ylabel("Height")  # y-axis label

# Reading the plot: height rises as weight rises, i.e. height and weight are
# positively correlated.

In [None]:
## Finding correlation
# Pairwise correlation between the columns of df. `method` can be:
#   'pearson'  - linear correlation (the default, spelled out here)
#   'kendall'  - rank correlation
#   'spearman' - rank correlation (based on ordinal rankings)
df.corr(method='pearson')

# The result is a square matrix (returned as a DataFrame):
#   - rows and columns are the numerical columns of df;
#   - each cell is the correlation coefficient between two variables.
#
# Reading the output:
#   - Diagonal values (1.0): every variable correlates perfectly with itself,
#     so Weight/Weight and Height/Height are always 1.0.
#   - Off-diagonal value (0.931142): the correlation between Weight and Height
#     is approximately 0.93.
#   - Significance: this is a strong positive correlation - as Weight
#     increases, Height tends to increase as well.


In [None]:
## Seaborn for visualization
import seaborn as sns

# Draw a grid of scatter plots covering every pair of numerical columns in df;
# the diagonal shows each variable's own distribution (histogram or KDE).
sns.pairplot(data=df)

In [41]:
## Independent and dependent features

# X: independent features (inputs / predictors) that influence the target.
# They must be a DataFrame or another 2-dimensional array, which is the format
# libraries such as scikit-learn expect; selecting with a *list* of column
# labels keeps the result 2-dimensional.
X = df.loc[:, ['Weight']]

# y: dependent feature (target / output) that the model will predict.
# A 1-dimensional pandas Series (or NumPy array) is fine here.
y = df.loc[:, 'Height']


In [None]:
# Display the independent-feature DataFrame (the 'Weight' column).
X


In [None]:
# Display the dependent-feature Series (the 'Height' column).
y

In [None]:

# Pull the Weight column out as a pandas Series (a 1-D structure).
X_series = df['Weight']
# View the Series as a NumPy array and report its shape.
np.asarray(X_series).shape


In [None]:
# View the target Series as a NumPy array and report its shape.
np.asarray(y).shape

In [None]:
## Train Test Split
from sklearn.model_selection import train_test_split

# Purpose: train_test_split splits the dataset into two parts:
#   - Training set: used to train the machine learning model.
#   - Test set: used to evaluate the model's performance on unseen data.
# (Kept as comments: a bare triple-quoted string at the end of a cell is
# echoed back as the cell's output.)


In [None]:
# Split the features (X) and the target (y) into training and testing sets.
#
# test_size=0.25
#   Proportion of the dataset placed in the test split: 25% of the rows are
#   used for testing and the remaining 75% for training.
#
# random_state=42
#   Makes the split reproducible: with the same random_state value the dataset
#   is split the same way every time the code runs.
#
# (Kept as comments: a bare triple-quoted string at the end of a cell is
# echoed back as the cell's output.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
## Standardization
from sklearn.preprocessing import StandardScaler

# StandardScaler standardizes the features (independent variables).
#
# 1. What is standardization?
#    Standardization (also known as z-score normalization) rescales each
#    feature so that it has a mean of 0 and a standard deviation of 1.
#    It matters when features have different units or magnitudes (e.g. height
#    in centimeters and weight in kilograms).
#    It helps algorithms that rely on distance metrics (e.g. k-nearest
#    neighbors) and models trained with gradient descent. Ordinary
#    least-squares linear regression, used below, is not distance-based and
#    predicts the same values with or without scaling; here scaling mainly
#    changes how the coefficient and intercept are read.
#
# 2. When to use standardization
#    When features are measured on different scales (e.g. height in meters
#    and weight in kilograms) or when the algorithm relies on distance-based
#    calculations.
#
# (Kept as comments: a bare triple-quoted string at the end of a cell is
# echoed back as the cell's output.)


In [None]:
# Create the StandardScaler object used to standardize the data.
scaler = StandardScaler()

# fit_transform does two things on the training data:
#   fit:       computes the mean and standard deviation of X_train;
#   transform: subtracts that mean and divides by that standard deviation, so
#              the result has mean 0 and standard deviation 1.
# Note: the data is NOT modified in place. fit_transform returns a new NumPy
# array and the name X_train is rebound to it, so from here on X_train refers
# to the standardized array rather than the original DataFrame slice.
X_train = scaler.fit_transform(X_train)

In [None]:
# Scale the test features with the mean and standard deviation that were
# computed from the training set (transform only - no second fit).
#
# Why this matters: the test set must stay "unseen" during training, and that
# includes preprocessing. Using only training-set statistics avoids data
# leakage while still putting the test data on the same scale as the
# training data.
#
# Caution: X_test is rebound to the scaled array, so re-running this cell
# would scale already-scaled data a second time. Re-run from the train/test
# split cell if this cell has to be executed again.
X_test = scaler.transform(X_test)

In [None]:
# Display the test features after scaler.transform.
X_test

In [None]:
## Apply Simple Linear Regression
# LinearRegression is the linear model from sklearn.linear_model, a module of
# scikit-learn. (Kept as a comment: a bare triple-quoted string at the end of
# a cell is echoed back as the cell's output.)
from sklearn.linear_model import LinearRegression

In [56]:
# Create the LinearRegression estimator.
# n_jobs is left at its default: per the scikit-learn documentation it only
# gives a speedup for sufficiently large problems (more than one target with
# sparse X, or positive=True), none of which applies to this single-feature,
# single-target fit - so n_jobs=-1 would not use extra processors here.
regression = LinearRegression()

In [None]:
# Fit the model on the training features and training target.
regression.fit(X=X_train, y=y_train)

In [None]:
# Report the fitted line's parameters: slope (coef_) and intercept.
print(f"Coefficient or slope: {regression.coef_}")
print(f"Intercept: {regression.intercept_}")

In [None]:
## plot Training data plot best fit line
# Training points.
plt.scatter(X_train, y_train, label="Training data")
# Fitted line in a contrasting colour: scatter() and plot() would otherwise
# both take the first colour of the default cycle, hiding the line among the
# points.
plt.plot(X_train, regression.predict(X_train), color="red", label="Best-fit line")
plt.xlabel("Weight (standardized)")  # X_train was scaled by StandardScaler
plt.ylabel("Height")
plt.legend();  # trailing ';' suppresses the Legend repr in the cell output

### Prediction on the test data
1. Predicted height = intercept + coef_ × weight
2. y_pred_test = 156.470 + 17.29 × X_test (where X_test holds the standardized weights)

In [60]:
## Prediction for test data
# Predicted heights for the (scaled) test features.
y_pred = regression.predict(X=X_test)

In [61]:
## Performance Metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [None]:
# Error metrics on the test set, printed one per line in the order
# MSE, MAE, RMSE.
mse = mean_squared_error(y_test, y_pred)   # mean squared error
mae = mean_absolute_error(y_test, y_pred)  # mean absolute error
rmse = np.sqrt(mse)                        # root of the MSE
print(mse, mae, rmse, sep="\n")

## R-squared
Formula:

$$R^2 = 1 - \frac{SSR}{SST}$$

where:

- $R^2$ = coefficient of determination
- $SSR$ = sum of squares of residuals
- $SST$ = total sum of squares

In [63]:
from sklearn.metrics import r2_score

In [None]:
# Coefficient of determination (R²) of the predictions on the test set.
score = r2_score(y_true=y_test, y_pred=y_pred)
print(score)

**Adjusted R² formula:**

$$R^2_{adj} = 1 - \frac{(1 - R^2)(n - 1)}{n - k - 1}$$

where:

- $R^2$: the R² of the model
- $n$: the number of observations
- $k$: the number of predictor variables

In [None]:
# Adjusted R-squared: 1 - (1 - R²)(n - 1) / (n - k - 1)
n_obs = len(y_test)            # n: number of observations in the test set
n_predictors = X_test.shape[1]  # k: number of predictor columns
1 - (1 - score) * (n_obs - 1) / (n_obs - n_predictors - 1)

In [None]:
##  Linear Regression using OLS
!pip install statsmodels


In [66]:
import statsmodels.api as sm

In [67]:
model=sm.OLS(y_train,X_train).fit()

In [None]:
prediction=model.predict(X_test)
print(prediction)

In [None]:
# Print the plain-text regression report of the fitted OLS model.
print(model.summary().as_text())

In [None]:
## Prediction for new data
# The scaler was fitted on a DataFrame with a 'Weight' column, so the new
# value is passed as a DataFrame with that same column. A bare [[72]] list
# makes scikit-learn warn that X does not have valid feature names.
new_weight = pd.DataFrame({'Weight': [72]})
regression.predict(scaler.transform(new_weight))