### **IMPORTING LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

### 2. Load the dataset into Google Colab

In [None]:
# Read the abalone CSV into the working DataFrame `df`.
DATA_PATH = "/content/sample_data/content/abalone.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Derive the 'age' column as the ring count plus 1.5, then discard the raw
# 'Rings' column so that only 'age' remains.
df = df.assign(age=df['Rings'] + 1.5).drop(columns='Rings')

### 3. UNIVARIATE ANALYSIS

In [None]:
# Histograms of the DataFrame's columns (30 bins each) on a 2x4 grid of axes.
df.hist(bins=30, layout=(2, 4), figsize=(20, 10), grid=False)

In [None]:
# Mean of every measurement per 'Sex' group, ordered by mean age (ascending).
measure_cols = ['Length', 'Diameter', 'Height', 'Whole weight',
                'Shucked weight', 'Viscera weight', 'Shell weight', 'age']
sex_means = df.groupby('Sex')[measure_cols].mean()
sex_means.sort_values(by='age')

### 3. BIVARIATE ANALYSIS & MULTIVARIATE ANALYSIS

In [None]:
# Collect the names of the numeric columns and draw every pairwise
# scatter plot between them.
numerical_features = df.select_dtypes(include='number').columns
sns.pairplot(data=df.loc[:, numerical_features])

### 4. Descriptive statistics

In [None]:
# Summary statistics produced by DataFrame.describe() for `df`.
summary_stats = df.describe()
summary_stats

### 5. Check for Missing Values

In [None]:
# Number of missing entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

### 6. OUTLIER HANDLING

In [None]:
# One-hot encode `df` with pandas' get_dummies and keep an independent copy
# of the encoded frame in `dummy_data` (taken before any rows are dropped
# by the outlier-removal cells that follow).
encoded = pd.get_dummies(df)
df, dummy_data = encoded, encoded.copy()

In [None]:
# Scatter plot of age against viscera weight.
var = 'Viscera weight'
plt.scatter(df[var], df['age'])
plt.grid(True)

In [None]:
# outliers removal
df.drop(df[(df['Viscera weight']> 0.5) & (df['age'] < 20)].index, inplace=True)
df.drop(df[(df['Viscera weight']<0.5) & (df['age'] > 25)].index, inplace=True)

In [None]:
# Scatter plot of age against shell weight.
var = 'Shell weight'
plt.scatter(df[var], df['age'])
plt.grid(True)

# Outlier removal: drop rows matching each rule in turn (the second rule is
# evaluated after the first drop has been applied).
heavy_shell = df[var].gt(0.6) & df['age'].lt(25)
df.drop(index=df[heavy_shell].index, inplace=True)

old_light_shell = df[var].lt(0.8) & df['age'].gt(25)
df.drop(index=df[old_light_shell].index, inplace=True)

In [None]:
# Scatter plot of age against shucked weight.
var = 'Shucked weight'
plt.scatter(df[var], df['age'])
plt.grid(True)

# Outlier removal: two sequential in-place drops.
heavy_but_young = df[var].ge(1) & df['age'].lt(20)
df.drop(index=df[heavy_but_young].index, inplace=True)

light_but_old = df[var].lt(1) & df['age'].gt(20)
df.drop(index=df[light_but_old].index, inplace=True)

In [None]:
# Scatter plot of age against whole weight.
var = 'Whole weight'
plt.scatter(df[var], df['age'])
plt.grid(True)

# Outlier removal: two sequential in-place drops.
heavy_but_young = df[var].ge(2.5) & df['age'].lt(25)
df.drop(index=df[heavy_but_young].index, inplace=True)

light_but_old = df[var].lt(2.5) & df['age'].gt(25)
df.drop(index=df[light_but_old].index, inplace=True)

In [None]:
# Scatter plot of age against diameter.
var = 'Diameter'
plt.scatter(df[var], df['age'])
plt.grid(True)

# Drop outliers one rule at a time; each rule is evaluated on the frame
# left behind by the previous one.
outlier_rules = (
    lambda d: d[var].lt(0.1) & d['age'].lt(5),
    lambda d: d[var].lt(0.6) & d['age'].gt(25),
    lambda d: d[var].ge(0.6) & d['age'].lt(25),
)
for rule in outlier_rules:
    df.drop(index=df[rule(df)].index, inplace=True)

In [None]:
# Scatter plot of age against height.
var = 'Height'
plt.scatter(df[var], df['age'])
plt.grid(True)

# Outlier removal: two sequential in-place drops.
tall_but_young = df[var].gt(0.4) & df['age'].lt(15)
df.drop(index=df[tall_but_young].index, inplace=True)

short_but_old = df[var].lt(0.4) & df['age'].gt(25)
df.drop(index=df[short_but_old].index, inplace=True)



In [None]:
# Scatter plot of age against length.
var = 'Length'
plt.scatter(df[var], df['age'])
plt.grid(True)

# Drop outliers one rule at a time; each rule is evaluated on the frame
# left behind by the previous one.
outlier_rules = (
    lambda d: d[var].lt(0.1) & d['age'].lt(5),
    lambda d: d[var].lt(0.8) & d['age'].gt(25),
    lambda d: d[var].ge(0.8) & d['age'].lt(25),
)
for rule in outlier_rules:
    df.drop(index=df[rule(df)].index, inplace=True)

### 7. Categorical columns

In [None]:
# Split the column names by dtype: numeric columns vs. object (string) columns.
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24, where
# it raises AttributeError; the builtin `object` selects the same dtype.
# NOTE(review): `pd.get_dummies` was already applied to `df` in an earlier
# cell, so `categorical_features` is presumably empty here -- confirm.
numerical_features = df.select_dtypes(include=[np.number]).columns
categorical_features = df.select_dtypes(include=[object]).columns


In [None]:
# Show the column names collected in `numerical_features`.
numerical_features

In [None]:
# Show the column names collected in `categorical_features`.
categorical_features

### **ENCODING**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder (held in `le`) and print how often each distinct
# 'Length' value occurs.
le = LabelEncoder()
length_counts = df['Length'].value_counts()
print(length_counts)

### 8. Split the dependent and independent variables

In [None]:
# Feature matrix: every column except the target 'age'.
# Selecting by name instead of the positional `iloc[:, :5]` keeps all
# measurement and one-hot columns as predictors and does not depend on the
# column order produced by earlier cells.
x = df.drop(columns='age')
x.head()

In [None]:
# Target: the derived 'age' column only.
# The positional `iloc[:, 5:]` selected every column from position 5 onward,
# not just the target, which turned the fit into an unintended multi-output
# regression.
y = df['age']
y.head()

### 9. Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature matrix `x`.
# The original cell scaled `x_train`, which is only created by the train/test
# split further down, so it raised NameError on a fresh top-to-bottom run and
# left the test features unscaled. Scaling `x` here puts both partitions
# produced by the later split on the same scale.
# NOTE(review): the scaler is fitted on all rows, including those that end up
# in the test split; move this step after the split (fit on train, transform
# test) if strict train/test isolation is required.
ss = StandardScaler()
x = pd.DataFrame(ss.fit_transform(x), columns=x.columns, index=x.index)

In [None]:
# Predict with `mlr` for the first nine rows of `x_test`.
# NOTE(review): `mlr` and `x_test` are first assigned in later cells
# (sections 10 and 11), so this cell fails with NameError on a fresh
# top-to-bottom run; it should be executed after the model has been fitted.
mlrpred=mlr.predict(x_test[0:9])

In [None]:
# Show the predictions stored in `mlrpred`.
mlrpred

### 10. Train, Test, Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every metric computed from it) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

### 11. Model building

In [None]:
from sklearn.linear_model import LinearRegression

# Create a linear regression estimator and fit it on the training partition.
mlr = LinearRegression()
mlr.fit(X=x_train, y=y_train)

### 12 & 13. Train and Test the model

In [None]:
# First five rows of the test features.
x_test[:5]

In [None]:
# First five rows of the test targets.
y_test[:5]

### 14. Measure the performance using metrics

In [None]:
from sklearn.metrics import r2_score

# r2_score expects (y_true, y_pred). R^2 is not symmetric in its arguments,
# so the ground-truth values must come first and the predictions second.
r2_score(y_test, mlr.predict(x_test))