Importing the Dependencies

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Processing

In [3]:
# loading the data 
data = pd.read_csv('/content/riceClassification.csv')

In [4]:
print(data)

          id  Area  MajorAxisLength  MinorAxisLength  Eccentricity  \
0          1  4537        92.229316        64.012769      0.719916   
1          2  2872        74.691881        51.400454      0.725553   
2          3  3048        76.293164        52.043491      0.731211   
3          4  3073        77.033628        51.928487      0.738639   
4          5  3693        85.124785        56.374021      0.749282   
...      ...   ...              ...              ...           ...   
18180  18181  5853       148.624571        51.029281      0.939210   
18181  18182  7585       169.593996        58.141659      0.939398   
18182  18183  6365       154.777085        52.908085      0.939760   
18183  18184  5960       151.397924        51.474600      0.940427   
18184  18185  6134       153.081981        51.590606      0.941500   

       ConvexArea  EquivDiameter    Extent  Perimeter  Roundness  \
0            4677      76.004525  0.657536    273.085   0.764510   
1            3015      

In [8]:
# print the first 5 rows of the dataframe
data.head()

Unnamed: 0,id,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
0,1,4537,92.229316,64.012769,0.719916,4677,76.004525,0.657536,273.085,0.76451,1.440796,1
1,2,2872,74.691881,51.400454,0.725553,3015,60.471018,0.713009,208.317,0.831658,1.453137,1
2,3,3048,76.293164,52.043491,0.731211,3132,62.296341,0.759153,210.012,0.868434,1.46595,1
3,4,3073,77.033628,51.928487,0.738639,3157,62.5513,0.783529,210.657,0.870203,1.483456,1
4,5,3693,85.124785,56.374021,0.749282,3802,68.571668,0.769375,230.332,0.874743,1.51,1


In [11]:
# print last 5 rows of the dataframe
data.tail()

Unnamed: 0,id,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
18180,18181,5853,148.624571,51.029281,0.93921,6008,86.326537,0.498594,332.96,0.663444,2.912535,0
18181,18182,7585,169.593996,58.141659,0.939398,7806,98.272692,0.647461,385.506,0.641362,2.91691,0
18182,18183,6365,154.777085,52.908085,0.93976,6531,90.023162,0.561287,342.253,0.682832,2.925396,0
18183,18184,5960,151.397924,51.4746,0.940427,6189,87.112041,0.492399,343.371,0.635227,2.941216,0
18184,18185,6134,153.081981,51.590606,0.9415,6283,88.374495,0.489975,338.613,0.672274,2.967245,0


In [12]:
# number of rows and columns in the dataset
data.shape

(18185, 12)

In [13]:
# getting some information about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18185 entries, 0 to 18184
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               18185 non-null  int64  
 1   Area             18185 non-null  int64  
 2   MajorAxisLength  18185 non-null  float64
 3   MinorAxisLength  18185 non-null  float64
 4   Eccentricity     18185 non-null  float64
 5   ConvexArea       18185 non-null  int64  
 6   EquivDiameter    18185 non-null  float64
 7   Extent           18185 non-null  float64
 8   Perimeter        18185 non-null  float64
 9   Roundness        18185 non-null  float64
 10  AspectRation     18185 non-null  float64
 11  Class            18185 non-null  int64  
dtypes: float64(8), int64(4)
memory usage: 1.7 MB


In [14]:
# checking for missing values
data.isnull().sum()

id                 0
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
EquivDiameter      0
Extent             0
Perimeter          0
Roundness          0
AspectRation       0
Class              0
dtype: int64

In [15]:
# statistical measures about the data
data.describe()

Unnamed: 0,id,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
count,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0
mean,9093.0,7036.492989,151.680754,59.807851,0.915406,7225.817872,94.132952,0.616653,351.606949,0.707998,2.599081,0.549079
std,5249.701658,1467.19715,12.376402,10.061653,0.030575,1502.006571,9.90625,0.104389,29.50062,0.06731,0.434836,0.497599
min,1.0,2522.0,74.133114,34.409894,0.676647,2579.0,56.666658,0.383239,197.015,0.17459,1.358128,0.0
25%,4547.0,5962.0,145.67591,51.393151,0.891617,6125.0,87.126656,0.53853,333.99,0.650962,2.208527,0.0
50%,9093.0,6660.0,153.88375,55.724288,0.923259,6843.0,92.085696,0.601194,353.088,0.701941,2.602966,1.0
75%,13639.0,8423.0,160.056214,70.156593,0.941372,8645.0,103.559146,0.695664,373.003,0.76928,2.964101,1.0
max,18185.0,10210.0,183.211434,82.550762,0.966774,11008.0,114.016559,0.886573,508.511,0.904748,3.911845,1.0


In [16]:
# checking the distribution of Target Varibale
data['Class'].value_counts()

1    9985
0    8200
Name: Class, dtype: int64

1 --> Jasmine

0 --> Gonen

In [17]:
data.groupby('Class').mean()

Unnamed: 0_level_0,id,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,13777.018659,8358.541341,153.698425,69.997435,0.888799,8575.295488,102.980173,0.651606,368.966422,0.769776,2.199598
1,5246.334702,5950.784777,150.023778,51.43984,0.937257,6117.583876,86.867333,0.587948,337.350798,0.657264,2.927149


In [31]:
data = data.drop('id', axis=1)

Separating the features and target

In [32]:
X = data.drop(columns='Class', axis=1)
Y = data['Class']

In [33]:
print(X)

       Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0      4537        92.229316        64.012769      0.719916        4677   
1      2872        74.691881        51.400454      0.725553        3015   
2      3048        76.293164        52.043491      0.731211        3132   
3      3073        77.033628        51.928487      0.738639        3157   
4      3693        85.124785        56.374021      0.749282        3802   
...     ...              ...              ...           ...         ...   
18180  5853       148.624571        51.029281      0.939210        6008   
18181  7585       169.593996        58.141659      0.939398        7806   
18182  6365       154.777085        52.908085      0.939760        6531   
18183  5960       151.397924        51.474600      0.940427        6189   
18184  6134       153.081981        51.590606      0.941500        6283   

       EquivDiameter    Extent  Perimeter  Roundness  AspectRation  
0          76.004525  0.657536

In [34]:
print(Y)

0        1
1        1
2        1
3        1
4        1
        ..
18180    0
18181    0
18182    0
18183    0
18184    0
Name: Class, Length: 18185, dtype: int64


Splitting the data into training data & Testing data

In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [36]:
print(X.shape, X_train.shape, X_test.shape)

(18185, 10) (14548, 10) (3637, 10)


Model Training

Logistic Regression

In [37]:
model = LogisticRegression()

In [38]:
# training the Logistic Regression model using Training data

model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

Model Evaluation

Accuracy Score

In [39]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [40]:
print('Accuracy on training data = ', training_data_accuracy)

Accuracy on training data =  0.9886582348089085


In [41]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [42]:
print('Accuracy on test data = ', test_data_accuracy)

Accuracy on test data =  0.9881770690129228


Building a Predictive System

In [44]:
input_data = (3048,76.2931638,52.04349114,0.7312109273,3132,62.29634124,0.7591531756,210.012,0.8684335737,1.465950153)

# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predicting for one datapoint
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The Rice is Gonen')

else:
  print('The Rice is Jasmine')



[1]
The Rice is Jasmine


