# In [None]:  (Jupyter cell marker left over from the notebook export)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Lift pandas' display limits so printed frames are shown in full:
# no row/column truncation, auto-detected width, and no wrapping of wide
# frames across several printed blocks.
for _option_name, _option_value in {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': None,
    'display.expand_frame_repr': False,
}.items():
    pd.set_option(_option_name, _option_value)

# Load the dataset and print a first overview of its contents.
JP = pd.read_csv("agricultural_yield.csv")
print(JP.head(10))
print(JP.isna().sum())  # number of missing values per column

# Remove repeated rows, keeping the first occurrence of each.
# The earlier `JP[JP.duplicated()]` did the opposite: it kept ONLY the rows
# flagged as duplicates and discarded every unique observation (leaving an
# empty frame whenever the file contains no duplicates at all).
JP = JP.drop_duplicates()

print(JP.columns)
print(JP.dtypes)

# Box plot of 'Irrigation_Schedule' before any filtering, to eyeball
# extreme values.
fig_before, ax_before = plt.subplots(figsize=(14, 5))
ax_before.boxplot(JP['Irrigation_Schedule'])
ax_before.set_title(
    " Outliners Detection",
    fontweight='bold',
    color='green',
    fontsize=40,
)
plt.show()


# Outlier removal on 'Irrigation_Schedule' using the 1.5 * IQR rule:
# rows whose value lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are dropped.
irrigation = JP['Irrigation_Schedule']

Q1 = irrigation.quantile(0.25)
Q3 = irrigation.quantile(0.75)

IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# The fences themselves are NOT outliers, so the comparison is inclusive.
# With the previous strict `>` / `<` test, values sitting exactly on a fence
# were dropped, and when IQR == 0 (lower == upper == Q1) every row was
# removed. `.copy()` makes the result an independent frame so that adding
# columns to it later does not write into a slice of the original.
JP = JP[irrigation.between(lower, upper)].copy()

# Same box plot again, now on the filtered frame, to confirm the effect of
# the outlier removal.
fig_after, ax_after = plt.subplots(figsize=(14, 5))
ax_after.boxplot(JP['Irrigation_Schedule'])
ax_after.set_title(
    "After Removing Outliners",
    fontweight='bold',
    color='green',
    fontsize=40,
)
plt.show()

# Annotated heatmap of the pairwise correlations between the columns of JP.
fig_corr, ax_corr = plt.subplots(figsize=(14, 5))
correlations = JP.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax_corr)
ax_corr.set_title(
    "Correlation Between Columns",
    fontweight='bold',
    color='purple',
    fontsize=40,
)
plt.show()

# Engineered interaction features built from the raw columns.
#
# `assign` returns a new DataFrame instead of writing columns into JP in
# place. JP is the result of boolean filtering at this point, and item
# assignment on such a frame raises pandas' SettingWithCopyWarning.
#
#   A = irrigation * seed variety * fertilizer
#   B = irrigation * seed variety / 100
#   C = soil quality * seed variety
#   D = soil quality + seed variety + (fertilizer * irrigation)
#   F = soil quality + (irrigation / fertilizer)
#
# In D and F the multiplication/division binds tighter than the additions,
# exactly as in the original expressions.
# NOTE(review): F divides by the fertilizer amount; a row with 0 kg/ha would
# produce inf/NaN here — confirm the data never contains such rows.
JP = JP.assign(
    A=JP['Irrigation_Schedule'] * JP['Seed_Variety'] * JP['Fertilizer_Amount_kg_per_hectare'],
    B=JP['Irrigation_Schedule'] * JP['Seed_Variety'] / 100,
    C=JP['Soil_Quality'] * JP['Seed_Variety'],
    D=JP['Soil_Quality'] + JP['Seed_Variety'] + JP['Fertilizer_Amount_kg_per_hectare'] * JP['Irrigation_Schedule'],
    F=JP['Soil_Quality'] + JP['Irrigation_Schedule'] / JP['Fertilizer_Amount_kg_per_hectare'],
)

# Model inputs (two raw columns plus the engineered ones) and the target.
feature_columns = [
    'Seed_Variety',
    'Irrigation_Schedule',
    'A',
    'B',
    'C',
    'D',
    'F',
]
x = JP[feature_columns]
y = JP['Yield_kg_per_hectare']
print(JP.head(10))

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardised copy of the whole feature matrix. It is kept because code
# further down the script reads `x_scaled`; it is deliberately NOT used for
# the cross-validation below.
x_scaled = StandardScaler().fit_transform(x)

model = LinearRegression()

# 5-fold cross-validated R^2. The scaler is part of the cross-validated
# pipeline, so it is re-fitted on the training part of every fold. Scaling
# the full dataset first and cross-validating on the result lets statistics
# of the held-out fold leak into preprocessing.
cv_pipeline = make_pipeline(StandardScaler(), model)
score = cross_val_score(cv_pipeline, x, y, cv=5, scoring='r2')

print("CV For LinearRegression", score)
print("Average CV For LinearRegression", score.mean())


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# The split is done on the raw features and the scaler is fitted on the
# training rows only, so no test-set statistics reach preprocessing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=12
)

scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

model = LinearRegression()

model.fit(x_train, y_train)
Y_Pred = model.predict(x_test)

# `score` of a regressor is the R^2 of its predictions on the given data.
Train_Score = model.score(x_train, y_train)
Test_Score = model.score(x_test, y_test)

print("Train Score ==", Train_Score)
print("Test Score", Test_Score)

print("MAE For LinearRegression == ", mean_absolute_error(y_test, Y_Pred))
# This line prints the R^2 score; it was previously mislabelled as "MAE".
print("R2 Score For LinearRegression == ", r2_score(y_test, Y_Pred))

# from sklearn.ensemble import RandomForestRegressor

# model=RandomForestRegressor()

# model.fit(x_train,y_train)
# Y_Pred=model.predict(x_test)

# Train_Score=model.score(x_train,y_train)
# Test_Score=model.score(x_test,y_test)

# print("Train Score ==",Train_Score)
# print("Test Score",Test_Score)

# print("MAE For RandomForestRegressor == ",mean_absolute_error(y_test,Y_Pred))
# print("R2 Score For RandomForestRegressor == ",r2_score(y_test,Y_Pred))


# Predicted vs. actual yield on the test rows. The dashed diagonal is the
# line predicted == actual, drawn across the range of the actual values.
fig_final, ax_final = plt.subplots(figsize=(16, 6))
ax_final.scatter(
    y_test,
    Y_Pred,
    marker="*",
    s=30,
    color='blue',
    label='Actual + Predicted Yield',
)
ax_final.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    linestyle="--",
    linewidth=2,
    color='black',
)
ax_final.set_xlabel("Actual Yield", color='purple', fontsize=20, fontweight='bold')
ax_final.set_ylabel("Predicted Yield", color='purple', fontsize=20, fontweight='bold')
ax_final.set_title("Final Prediction", fontweight='bold', color='red', fontsize=50)
ax_final.legend(loc='upper left')
plt.show()