In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn import model_selection as ms

d_test = pd.read_excel(r"E:\DeepLearning\agriculture\test_agriculture.xlsx")
d_train = pd.read_csv(r"E:\DeepLearning\agriculture\train_agriculture.csv", encoding='latin1')

imp = SimpleImputer(missing_values=np.nan, strategy='mean')

x = d_train.iloc[:, 1:-1]
y = d_train.iloc[:, -1].to_numpy()

x_train, x_test, y_train, y_test = ms.train_test_split(x, y, test_size=0.3, random_state=42)


In [2]:
x_train

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
80255,850,0,1,2,60,27.0,6,1
54422,1296,0,0,2,5,10.0,20,3
70334,1296,0,1,2,20,34.0,17,1
32243,1212,0,1,2,20,16.0,27,2
69334,231,0,0,2,20,17.0,7,1
...,...,...,...,...,...,...,...,...
6265,2139,0,0,2,40,,8,3
54886,577,0,1,2,40,23.0,8,2
76820,2688,1,0,3,5,51.0,0,2
860,851,0,0,2,10,30.0,10,2


In [3]:
x_test

Unnamed: 0,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
53358,1898,1,0,2,25,31.0,5,2
56080,677,0,1,3,40,33.0,0,1
29914,2267,0,1,2,20,20.0,23,3
10054,2999,1,0,2,20,,8,3
80617,1898,0,1,2,40,40.0,9,1
...,...,...,...,...,...,...,...,...
31009,984,1,0,3,40,18.0,0,2
44424,1678,0,0,2,20,49.0,1,1
53186,851,1,0,2,5,6.0,13,2
64543,1132,0,0,2,30,,16,2


In [12]:
y_test

array([0, 2, 0, ..., 0, 0, 1])

In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model, naive_bayes, neighbors, svm, tree, ensemble, neural_network, discriminant_analysis
from sklearn.metrics import accuracy_score

#20 model
models = {
    "LogisticRegression": linear_model.LogisticRegression(max_iter=500),
    "RidgeClassifier": linear_model.RidgeClassifier(),
    "SGDClassifier": linear_model.SGDClassifier(),
    
    "GaussianNB": naive_bayes.GaussianNB(),
    "MultinomialNB": naive_bayes.MultinomialNB(),
    "BernoulliNB": naive_bayes.BernoulliNB(),
    "CategoricalNB": naive_bayes.CategoricalNB(),
    
    "KNeighborsClassifier": neighbors.KNeighborsClassifier(),
    "RadiusNeighborsClassifier": neighbors.RadiusNeighborsClassifier(radius=10.0, outlier_label=-1),
    "NearestCentroid": neighbors.NearestCentroid(),
    
    "SVC": svm.SVC(),
    "LinearSVC": svm.LinearSVC(),
    "NuSVC": svm.NuSVC(),
    
    "DecisionTreeClassifier": tree.DecisionTreeClassifier(),
    "RandomForestClassifier": ensemble.RandomForestClassifier(),
    "ExtraTreesClassifier": ensemble.ExtraTreesClassifier(),
    "GradientBoostingClassifier": ensemble.GradientBoostingClassifier(),
    "HistGradientBoostingClassifier": ensemble.HistGradientBoostingClassifier(),
    "AdaBoostClassifier": ensemble.AdaBoostClassifier(),
    
    "MLPClassifier": neural_network.MLPClassifier(max_iter=500),
}


In [8]:
model = ensemble.RandomForestClassifier()

pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", model)
])

pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
acc = accuracy_score(y_test, y_pred)

print(f"RandomForestClassifier -> {acc:.6f}")


RandomForestClassifier -> 0.824555


In [14]:
from sklearn import svm 
model = svm.SVC()

pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", model)
])

pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
acc = accuracy_score(y_test, y_pred)

print(f"SVC -> {acc:.6f}")

SVC -> 0.839110


# Result của 20 models với bảng

In [11]:
results = {}
for name, model in models.items():
    try:
        pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler(with_mean=False)),
            ("clf", model)
        ])
        pipe.fit(x_train, y_train)
        y_pred = pipe.predict(x_test)
        results[name] = accuracy_score(y_test, y_pred)
    except Exception:
        results[name] = None 

# In bảng kết quả 2 cột
df_results = pd.DataFrame(list(results.items()), columns=["Model", "Accuracy"])
print(df_results.to_string(index=False))


                         Model  Accuracy
            LogisticRegression  0.832658
               RidgeClassifier  0.834384
                 SGDClassifier  0.834871
                    GaussianNB  0.820017
                 MultinomialNB  0.834984
                   BernoulliNB  0.839110
                 CategoricalNB  0.826994
          KNeighborsClassifier  0.826994
     RadiusNeighborsClassifier  0.834871
               NearestCentroid  0.587553
                           SVC  0.839110
                     LinearSVC  0.834309
                         NuSVC       NaN
        DecisionTreeClassifier  0.744054
        RandomForestClassifier  0.824105
          ExtraTreesClassifier  0.814502
    GradientBoostingClassifier  0.843349
HistGradientBoostingClassifier  0.844474
            AdaBoostClassifier  0.835246
                 MLPClassifier  0.841436


In [15]:
print("RandomForestClassifier:", results.get("RandomForestClassifier", "ko có"))

RandomForestClassifier: 0.824105334233626
