In [2]:
import os

import matplotlib.pyplot as plt ## Visualization
import numpy as np ## Linear Algebra
import pandas as pd ## To work with data
import plotly as py ## Visuaization
import plotly.express as px ## Visualization
import plotly.graph_objects as go ## Visualization
import seaborn as sns ## Visualization
from plotly import tools ## Visualization
from plotly.subplots import make_subplots ## Visualization

In [4]:
# Default to the Colab upload location used originally; set CROP_DATA_PATH to
# run the notebook elsewhere without editing this cell.
DATA_PATH = os.environ.get("CROP_DATA_PATH", "/content/crop_production.csv")
df = pd.read_csv(DATA_PATH)

In [None]:
# Peek at the first five records as a quick sanity check of the load.
df.head(n=5)

# Data Cleaning

In [8]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(246091, 7)

In [None]:
# Column names, dtypes and non-null counts in a single summary.
df.info()

We have 4 categorical attributes, namely State_Name, District_Name, Season and Crop. The other 3 attributes (Crop_Year, Area and Production) are numerical, with Area in hectares and Production in tonnes.

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:
# Fraction of rows with no Production value (mean of a boolean mask = share of True).
df["Production"].isna().mean()

The Production attribute has 3730 missing values, which account for just 1.51% of the entire dataset. Hence, we can drop these samples.

In [None]:
# Keep only rows that have a Production value, then re-check the null counts.
df = df.dropna(subset=["Production"])
df.isna().sum()

# Checking for any correlation between the numerical attributes.

In [None]:
# Correlate numeric columns only. DataFrame.corr() on a frame that still holds
# string columns (State_Name, District_Name, Season, Crop) raises in pandas >= 2.0.
numeric_corr = df.select_dtypes(include="number").corr()

fig, ax = plt.subplots()
ax.tick_params(labelsize=10)
sns.heatmap(numeric_corr, annot=True, ax=ax);

There is no considerable correlation between any of the attributes in the dataset.

# Individual Attribute Analysis

## State_Name

In [None]:
# Distinct state / union-territory labels present in the data.
df["State_Name"].unique()

In [None]:
# Number of distinct states.
df["State_Name"].nunique()

In [33]:
# Ten states with the most records.
df["State_Name"].value_counts().head(n=10)

Uttar Pradesh     33189
Madhya Pradesh    22604
Karnataka         21079
Bihar             18874
Assam             14622
Odisha            13524
Tamil Nadu        13266
Maharashtra       12496
Rajasthan         12066
Chhattisgarh      10368
Name: State_Name, dtype: int64

This dataset encodes agriculture data for 33 Indian states (including Union Territories). As seen above, we have more data from states like Uttar Pradesh, Madhya Pradesh and Karnataka than the other states.

## District_Name

In [None]:
# Distinct district labels present in the data.
df["District_Name"].unique()

In [None]:
# Number of distinct districts.
df["District_Name"].nunique()

In [None]:
# Record count per district, most frequent first.
df["District_Name"].value_counts()

On the district front, we have 646 districts, with the most data coming from Tumkur, Belgaum, Bijapur, Hassan and Bellary, all in Karnataka.

## Crop_Year

In [None]:
# Number of distinct crop years covered.
df["Crop_Year"].nunique()

In [None]:
# First and last crop year, one per line.
print(df["Crop_Year"].min(), df["Crop_Year"].max(), sep="\n")

The dataset contains agricultural information for 19 years from the year 1997 to 2015.

In [None]:
# Record count per crop year, most frequent first.
df["Crop_Year"].value_counts()

Most of the data records come from the years 2002-2011.

## Season

In [None]:
# Distinct season labels exactly as stored in the data.
df["Season"].unique()

In [None]:
# Number of distinct seasons.
df["Season"].nunique()

In [None]:
# Record count per season, most frequent first.
df["Season"].value_counts()

The dataset covers crops from 6 different seasons, with most data points belonging to Kharif, Rabi and annual (whole-year) crops.

## Crop

In [None]:
# Distinct crop labels exactly as stored in the data.
df["Crop"].unique()

In [38]:
# Number of distinct crops.
df["Crop"].nunique()

124

In [37]:
# Record count per crop, most frequent first.
df["Crop"].value_counts()

Rice                 15082
Maize                13787
Moong(Green Gram)    10106
Urad                  9710
Sesamum               8821
                     ...  
Litchi                   6
Coffee                   6
Apple                    4
Peach                    4
Other Dry Fruit          1
Name: Crop, Length: 124, dtype: int64

The dataset shows data for 124 different crop varieties. We can see that the crops with the most records are Rice, Maize and Moong (Green Gram).

## Area

In [None]:
# Summary statistics for the cultivated area.
df["Area"].describe()

In [None]:
# Ten most frequently recorded area values.
df["Area"].value_counts().head(n=10)

Area under cultivation ranges from 0.1 hectares to 8580100 hectares, with an average area of 12167.41 hectares. A lot of the farmers have a cultivated land area of 1 to 10 hectares.

In [None]:
# Draw on an explicit Axes, label it, and end with plt.show() so the cell
# renders only the figure instead of echoing the dict of artists that
# plt.boxplot returns.
fig, ax = plt.subplots()
ax.boxplot(df["Area"])
ax.set_title("Distribution of Area")
ax.set_ylabel("Area (hectares)")
plt.show()

Looking at the distribution we can say that the attribute is highly skewed with quite a few outliers.

## Production

In [None]:
# Summary statistics for production.
df["Production"].describe()

Production values range from 0 to 1250800000 tonnes, with an average production being 582503.4 tonnes.

In [None]:
# Draw on an explicit Axes, label it, and end with plt.show() so the cell
# renders only the figure instead of echoing the dict of artists that
# plt.boxplot returns.
fig, ax = plt.subplots()
ax.boxplot(df["Production"])
ax.set_title("Distribution of Production")
ax.set_ylabel("Production (tonnes)")
plt.show()

Looking at the distribution we can say that the attribute is highly skewed with quite a few outliers.

# EDA

## Overall Crop Production by State

In [None]:
# Total production per state, sorted ascending so the largest producers end up
# on the right-hand side of the bar chart.
temp = (
    df.groupby(by='State_Name')['Production']
    .sum()
    .reset_index()
    .sort_values(by='Production')
)
px.bar(temp, x='State_Name', y='Production')

From above graph we can see that : 

*   Kerala is the highest crop-producing state overall. It has produced more than 500% of what the runner-up state, Andhra Pradesh, produced.
*   The top 3 crop-producing states are all from South India; together their totals are so large that the remaining states are hard to compare on the same scale.

## Productivity by State

In [None]:
# Sum area and production per state. The columns are selected with a list:
# the bare-tuple form ['Area', 'Production'] on a GroupBy was removed in pandas 2.0.
temp = df.groupby('State_Name')[['Area', 'Production']].sum().reset_index()

# Productivity = total production divided by total cultivated area for the state.
temp['Production_Per_Unit_Area'] = temp['Production'] / temp['Area']
temp = temp.sort_values(by='Production_Per_Unit_Area')

px.bar(
    temp,
    x='State_Name',
    y='Production_Per_Unit_Area',
    color='Production_Per_Unit_Area',
)

Above graph tells us that :

*   Kerala is the most productive state when we compare in terms of production by area.
*   We see Andaman and Nicobar islands, Puducherry, Goa and many other states which are low in overall production, have higher productivity when we compare with the crop areas.

## Overall Production through the years

In [None]:
# Total production summed over all records for each crop year.
temp = (
    df.groupby(by='Crop_Year')['Production']
    .sum()
    .reset_index()
)
px.line(temp, x='Crop_Year', y='Production')

## Average Crop Area through the years

In [None]:
# Mean area per record for each crop year; marker colour and size both encode the same value.
temp = (
    df.groupby(by='Crop_Year')['Area']
    .mean()
    .reset_index()
)
px.scatter(temp, x='Crop_Year', y='Area', color='Area', size='Area')

In Above Graph we can see that :

*   Average crop area has decreased over the years.
*   We had the lowest average crop area in the years 2002 and 2003. (We have comparatively little data for the year 2015, so we will not consider it.)

## Most and least crop producing Districts

In [None]:
# Total production per district, sorted ascending: head() is then the five
# lowest-producing districts and tail() the five highest.
district_totals = (
    df.groupby(by='District_Name')['Production']
    .sum()
    .reset_index()
    .sort_values(by='Production')
)
top_districts = district_totals.tail()
bottom_districts = district_totals.head()

# make_subplots is imported explicitly rather than reached through `py.subplots`,
# which only resolves if some other import happened to load that submodule.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Highest crop producing districts', 'Least overall crop producing districts'),
)

# add_trace(row=, col=) replaces the deprecated append_trace.
fig.add_trace(
    go.Bar(x=top_districts['District_Name'], y=top_districts['Production']),
    row=1,
    col=1,
)
fig.add_trace(
    go.Bar(x=bottom_districts['District_Name'], y=bottom_districts['Production']),
    row=1,
    col=2,
)
fig.show()

## Most Produced Crops

In [None]:
# Total production per crop, largest first; show the five biggest.
top_crop_pro = (
    df.groupby("Crop")["Production"]
    .sum()
    .reset_index()
    .sort_values(by='Production', ascending=False)
)
top_crop_pro.head(5)

In [None]:
# Same per-crop totals sorted ascending, so tail() picks the five largest producers.
temp = (
    df.groupby(by='Crop')['Production']
    .sum()
    .reset_index()
    .sort_values(by='Production')
)
px.bar(temp.tail(), x='Crop', y='Production')

## EDA on Coconut

In [None]:
# The original filter matched the literal "Coconut " (with a trailing space),
# presumably because the raw label carries that padding. Comparing on the
# stripped label keeps those rows and still matches if the column is cleaned.
coc_df = df[df["Crop"].str.strip() == "Coconut"]
print(coc_df.Season.unique())
print(coc_df.Season.value_counts())

In [None]:
# Production by season for coconut records (seaborn's default bar height is the mean).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=coc_df, x="Season", y="Production", ax=ax)

In [None]:
# Production by state for coconut records; state names are rotated so they stay legible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=coc_df, x="State_Name", y="Production", ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Total coconut production per district, largest first.
top_coc_pro_dis = (
    coc_df.groupby("District_Name")["Production"]
    .sum()
    .reset_index()
    .sort_values(by='Production', ascending=False)
)

# Each district's share of the total, as a percentage. Vectorised arithmetic
# replaces the per-element lambda. The column label is kept exactly as before,
# so anything that reads it still works.
sum_max = top_coc_pro_dis["Production"].sum()
top_coc_pro_dis["precent of production"] = top_coc_pro_dis["Production"] / sum_max * 100

# Only the last expression of a cell is displayed, so the earlier mid-cell
# `[:5]` preview was a no-op and has been removed.
top_coc_pro_dis[:5]

In [None]:
# Production by crop year for coconut records.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=coc_df, x="Crop_Year", y="Production", ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Area against Production for coconut, with a fitted regression line and marginal distributions.
sns.jointplot(data=coc_df, x="Area", y="Production", kind="reg")

Insights from Coconut EDA:

*   Coconut production is directly proportional to area.
*   Its production is also gradually increasing over time.
*   Production is highest in Kerala.
*   It does not depend on the season.

## EDA on Sugarcane

In [None]:
# Restrict to sugarcane records and list the seasons they fall in.
sug_df = df.loc[df["Crop"] == "Sugarcane"]
print(sug_df["Season"].unique())
print(sug_df["Season"].value_counts())

In [None]:
# Production by season for sugarcane records (seaborn's default bar height is the mean).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=sug_df, x="Season", y="Production", ax=ax)

In [None]:
# Production by state for sugarcane records; state names are rotated so they stay legible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=sug_df, x="State_Name", y="Production", ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Total sugarcane production per district, largest first.
top_sug_pro_dis = (
    sug_df.groupby("District_Name")["Production"]
    .sum()
    .reset_index()
    .sort_values(by='Production', ascending=False)
)

# Each district's share of the total, as a percentage. Vectorised arithmetic
# replaces the per-element lambda. The column label is kept exactly as before,
# so anything that reads it still works.
sum_max = top_sug_pro_dis["Production"].sum()
top_sug_pro_dis["precent of production"] = top_sug_pro_dis["Production"] / sum_max * 100

# Only the last expression of a cell is displayed, so the earlier mid-cell
# `[:5]` preview was a no-op and has been removed.
top_sug_pro_dis[:5]

In [None]:
# Production by crop year for sugarcane records.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=sug_df, x="Crop_Year", y="Production", ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Area against Production for sugarcane, with a fitted regression line and marginal distributions.
sns.jointplot(data=sug_df, x="Area", y="Production", kind="reg")

Insights from Sugarcane EDA:

*   Sugarcane production is directly proportional to area.
*   It is produced in considerably high quantities in many states.

## EDA on Rice

In [None]:
# Restrict to rice records and list the seasons they fall in.
rice_df = df.loc[df["Crop"] == "Rice"]
print(rice_df["Season"].unique())
print(rice_df["Season"].value_counts())

In [None]:
# Production by season for rice records (seaborn's default bar height is the mean).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=rice_df, x="Season", y="Production", ax=ax)

In [None]:
# Production by state for rice records; state names are rotated so they stay legible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=rice_df, x="State_Name", y="Production", ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Total rice production per district, largest first.
top_rice_pro_dis = (
    rice_df.groupby("District_Name")["Production"]
    .sum()
    .reset_index()
    .sort_values(by='Production', ascending=False)
)

# Each district's share of the total, as a percentage. Vectorised arithmetic
# replaces the per-element lambda. The column label is kept exactly as before,
# so anything that reads it still works.
sum_max = top_rice_pro_dis["Production"].sum()
top_rice_pro_dis["precent of production"] = top_rice_pro_dis["Production"] / sum_max * 100

# Only the last expression of a cell is displayed, so the earlier mid-cell
# `[:5]` preview was a no-op and has been removed.
top_rice_pro_dis[:5]

In [None]:
# Production by crop year for rice records.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=rice_df, x="Crop_Year", y="Production", ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Area against Production for rice, with a fitted regression line and marginal distributions.
sns.jointplot(data=rice_df, x="Area", y="Production", kind="reg")

Insights:

*   Rice is produced throughout the year.
*   Production is directly proportional to Area.
*   India has consistently produced a high quantity of Rice every year since 1997 (the reduction in 2015 is due to having less data for that year).
*   The highest Rice producing state is Punjab.

