In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib as plt

In [17]:
# Select the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the rice classification table from the working directory and preview
# its first five rows.
csv_path = "riceClassification.csv"
data_df = pd.read_csv(csv_path)
print(data_df.head(5))

   id  Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0   1  4537        92.229316        64.012769      0.719916        4677   
1   2  2872        74.691881        51.400454      0.725553        3015   
2   3  3048        76.293164        52.043491      0.731211        3132   
3   4  3073        77.033628        51.928487      0.738639        3157   
4   5  3693        85.124785        56.374021      0.749282        3802   

   EquivDiameter    Extent  Perimeter  Roundness  AspectRation  Class  
0      76.004525  0.657536    273.085   0.764510      1.440796      1  
1      60.471018  0.713009    208.317   0.831658      1.453137      1  
2      62.296341  0.759153    210.012   0.868434      1.465950      1  
3      62.551300  0.783529    210.657   0.870203      1.483456      1  
4      68.571668  0.769375    230.332   0.874743      1.510000      1  


In [3]:
# Count the missing values in each column.
missing_per_column = data_df.isnull().sum()
print(missing_per_column)

id                 0
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
EquivDiameter      0
Extent             0
Perimeter          0
Roundness          0
AspectRation       0
Class              0
dtype: int64


In [None]:
# Keep only the rows that have no missing value in any column.
data_df = data_df.dropna(axis=0, how="any")

In [None]:
# Remove the "id" column: it is a running row number, not a grain measurement.
# The frame is rebound (no inplace=True) and errors="ignore" is passed so the
# cell can be executed more than once; the in-place version raised KeyError on
# a second run because "id" had already been removed.
data_df = data_df.drop(columns=["id"], errors="ignore")

In [8]:
# Report the size of the frame as (rows, columns).
n_rows, n_cols = data_df.shape
print((n_rows, n_cols))

(18185, 11)


In [None]:
# Inspect the target column: which labels occur, and how many rows each has.
class_labels = data_df["Class"]
print(class_labels.unique())
print(class_labels.value_counts())

[1 0]
Class
1    9985
0    8200
Name: count, dtype: int64


In [None]:
# Normalization (a pre-processing technique):
# It brings columns with very different numeric magnitudes into a common range of values.
# For each column, every value is divided by the maximum value found in that column.

In [None]:
# Keep an independent (deep) copy of the frame as it is at this point, so the
# un-scaled values stay available after data_df is modified.
original_df = data_df.copy(deep=True)

In [None]:
# Max-scaling: divide every feature column by its own maximum value.
# The "Class" label is deliberately excluded so the targets keep their original
# integer values instead of being rescaled and cast to float with the features.
# NOTE(review): the maxima are taken over the whole table, before the
# train/validation/test split, so held-out rows influence the scaling --
# consider computing them from the training rows only.
feature_cols = [col for col in data_df.columns if col != "Class"]
col_max = data_df[feature_cols].max()
# A column whose maximum is 0 would produce NaN/inf; dividing by 1 leaves it as is.
col_max[col_max == 0] = 1
# DataFrame / Series aligns the Series index with the column labels.
data_df[feature_cols] = data_df[feature_cols] / col_max

In [12]:
# Preview the first five rows of the frame after scaling.
print(data_df.iloc[:5])

       Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0  0.444368         0.503404         0.775435      0.744658    0.424873   
1  0.281293         0.407681         0.622653      0.750489    0.273892   
2  0.298531         0.416421         0.630442      0.756341    0.284520   
3  0.300979         0.420463         0.629049      0.764024    0.286791   
4  0.361704         0.464626         0.682901      0.775033    0.345385   

   EquivDiameter    Extent  Perimeter  Roundness  AspectRation  Class  
0       0.666610  0.741661   0.537029   0.844997      0.368316    1.0  
1       0.530370  0.804230   0.409661   0.919215      0.371471    1.0  
2       0.546380  0.856278   0.412994   0.959862      0.374747    1.0  
3       0.548616  0.883772   0.414262   0.961818      0.379222    1.0  
4       0.601418  0.867808   0.452954   0.966836      0.386007    1.0  


In [13]:
# Split the frame by position: every column except the last is a feature,
# and the last column holds the label.
feature_frame = data_df.iloc[:, :-1]
label_series = data_df.iloc[:, -1]
X = np.array(feature_frame)
Y = np.array(label_series)

In [14]:
# 70 / 15 / 15 split: set aside 30% of the rows as a hold-out, then cut that
# hold-out in half to obtain the test and validation sets.
# The hold-out has its own names so that no variable is both the input and the
# output of a split. Previously the second split read and overwrote
# X_test / y_test, so executing that cell again halved the test set every time.
# Seeds and proportions are unchanged, so the resulting splits are the same.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.3, random_state=0
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=0
)

In [19]:
class dataset(Dataset):
    """In-memory dataset of (features, label) pairs stored as float32 tensors.

    Both inputs are converted to ``torch.float32`` tensors and moved to the
    module-level ``device`` once, at construction time, so indexing returns
    tensors that already live on that device.
    """

    def __init__(self,X,Y):
        """Build the dataset.

        Args:
            X: Array-like of samples; its first dimension indexes the samples.
            Y: Array-like of labels, one per sample in ``X``.

        Raises:
            ValueError: If ``X`` and ``Y`` do not contain the same number of
                entries. Without this check a mismatch only surfaced later,
                as an IndexError or as silently ignored trailing labels.
        """
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same length, got {len(X)} and {len(Y)}"
            )
        self.X=torch.tensor(X,dtype=torch.float32).to(device)
        self.Y=torch.tensor(Y,dtype=torch.float32).to(device)

    def __len__(self):
        """Return the number of samples (size of the first dimension of X)."""
        return len(self.X)

    def __getitem__(self,idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx],self.Y[idx]

In [20]:
# Wrap each split (train, validation, test) in the tensor-backed dataset class.
training_data, validation_data, test_data = (
    dataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [21]:
# One place to change the mini-batch size used by all three loaders.
BATCH_SIZE = 8

# Only the training loader is shuffled. The validation and test loaders keep a
# fixed order so evaluation runs iterate over the data identically every time.
training_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = DataLoader(validation_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [23]:
# Sanity check: show the first mini-batch (features, a separator line, labels)
# and stop after that one batch.
for x,y in training_dataloader:
    print(x, "==========", y, sep="\n")
    break

tensor([[0.6209, 0.8163, 0.6639, 0.9624, 0.5888, 0.7879, 0.5690, 0.6691, 0.7605,
         0.6976],
        [0.5654, 0.7849, 0.6284, 0.9647, 0.5362, 0.7519, 0.7571, 0.6402, 0.7565,
         0.7087],
        [0.6981, 0.8756, 0.6947, 0.9660, 0.6656, 0.8355, 0.7259, 0.7164, 0.7460,
         0.7151],
        [0.8767, 0.9031, 0.8439, 0.9382, 0.8323, 0.9363, 0.6193, 0.7697, 0.8116,
         0.6071],
        [0.6259, 0.8520, 0.6470, 0.9719, 0.5989, 0.7911, 0.5363, 0.6859, 0.7295,
         0.7471],
        [0.6135, 0.7597, 0.7081, 0.9387, 0.5902, 0.7833, 0.6444, 0.6503, 0.7955,
         0.6087],
        [0.6751, 0.8940, 0.6653, 0.9745, 0.6434, 0.8217, 0.5382, 0.7165, 0.7212,
         0.7623],
        [0.5616, 0.8708, 0.5636, 0.9894, 0.5351, 0.7494, 0.7274, 0.6835, 0.6593,
         0.8766]], device='cuda:0')
tensor([1., 1., 1., 0., 1., 1., 1., 1.], device='cuda:0')
