<a href="https://colab.research.google.com/github/NamrathaGajulapalli/Myfirstproject/blob/main/machinelearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Mount Google Drive into the Colab filesystem.
# NOTE(review): none of the visible cells read from the mount point — confirm
# it is still needed before keeping this cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [12]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, r2_score
import warnings
warnings.filterwarnings('ignore')


In [13]:
# Load Datasets
# ----------------------------------------------------
# With as_frame=True, `.frame` already holds the feature columns plus a
# 'target' column, so no extra assignment is needed. Each frame is copied so
# that columns added later in the notebook do not modify the loader's own
# Bunch object.
iris = datasets.load_iris(as_frame=True)
iris_df = iris.frame.copy()

diabetes = datasets.load_diabetes(as_frame=True)
diab_df = diabetes.frame.copy()

print("Datasets Loaded:")
print("Iris shape:", iris_df.shape)
print("Diabetes shape:", diab_df.shape)

# ----------------------------------------------------


Datasets Loaded:
Iris shape: (150, 5)
Diabetes shape: (442, 11)


In [14]:
# 1. Central Tendency & Dispersion
# ----------------------------------------------------
print("\n1️⃣ Central Tendency & Dispersion (Iris):")

# Feature columns only; the 'target' column is left out of the statistics.
num = iris_df[iris.feature_names]

# One row per feature, one column per statistic.
# - mode() can return several rows when values tie; .iloc[0] keeps the first.
# - var() / std() use the pandas defaults (sample statistics, ddof=1).
stats = pd.DataFrame({
    'Mean': num.mean(),
    'Median': num.median(),
    'Mode': num.mode().iloc[0],
    'Variance': num.var(),
    'StdDev': num.std()
})
print(stats.round(3))



1️⃣ Central Tendency & Dispersion (Iris):
                    Mean  Median  Mode  Variance  StdDev
sepal length (cm)  5.843    5.80   5.0     0.686   0.828
sepal width (cm)   3.057    3.00   3.0     0.190   0.436
petal length (cm)  3.758    4.35   1.4     3.116   1.765
petal width (cm)   1.199    1.30   0.2     0.581   0.762


In [15]:
# 2. Pre-processing
# ----------------------------------------------------
print("\n2️⃣ Pre-processing Techniques:")

# (a) Attribute selection
# Score the iris features with the ANOVA F-test and report the two best.
# The selector is used for reporting only: X_clf keeps every feature column.
X_clf = iris_df[iris.feature_names]
y_clf = iris_df['target']
selector = SelectKBest(score_func=f_classif, k=2)
selector.fit(X_clf, y_clf)
print("Selected features:", X_clf.columns[selector.get_support()].tolist())
# → ['petal length (cm)', 'petal width (cm)']

# (b) Handling missing values
# Blank out roughly 5% of the diabetes feature cells (seeded, so repeatable),
# then fill each gap with its column mean.
X_reg = diab_df.drop(columns=['target']).copy()
y_reg = diab_df['target']
rng = np.random.default_rng(0)
missing_mask = rng.random(X_reg.shape) < 0.05
X_reg[missing_mask] = np.nan
imp = SimpleImputer(strategy='mean')
# NOTE(review): the imputer is fitted on all rows, before the train/test split
# made later in the notebook, so held-out rows contribute to the fill values.
X_reg = pd.DataFrame(
    imp.fit_transform(X_reg),
    columns=X_reg.columns,
    index=X_reg.index,  # keep row labels aligned with y_reg
)
print("Missing values handled using mean imputation ✔")

# (c) Discretization
# Three equal-width bins over sepal length, encoded as ordinal bin ids.
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
# fit_transform returns an (n_samples, 1) array; flatten it into a 1-D column.
iris_df['binned_sepal_length'] = discretizer.fit_transform(
    iris_df[['sepal length (cm)']]
).ravel()
print("Discretized sepal length into bins:", iris_df['binned_sepal_length'].unique())

# (d) Outlier elimination
# Tukey fences: keep rows whose value lies within 1.5 * IQR of the quartiles.
feature = 'age'
Q1, Q3 = diab_df[feature].quantile([0.25, 0.75])
IQR = Q3 - Q1
mask = diab_df[feature].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
# Actually apply the filter so the message below reports what was kept.
# X_reg / y_reg above are built from the unfiltered frame and are not changed.
diab_df_no_outliers = diab_df.loc[mask]
print("Outliers removed: kept", len(diab_df_no_outliers), "rows")

# ----------------------------------------------------



2️⃣ Pre-processing Techniques:
Selected features: ['petal length (cm)', 'petal width (cm)']
Missing values handled using mean imputation ✔
Discretized sepal length into bins: [0. 1. 2.]
Outliers removed: kept 442 rows


In [16]:
# Debugging aid: IPython's %whos magic prints a table of the names currently
# defined in the kernel (variable, type, and a short summary).
# NOTE(review): no later cell depends on this output — consider removing the
# cell before sharing the notebook.
%whos


Variable                 Type                Data/Info
------------------------------------------------------
DecisionTreeClassifier   ABCMeta             <class 'sklearn.tree._cla<...>.DecisionTreeClassifier'>
DecisionTreeRegressor    ABCMeta             <class 'sklearn.tree._cla<...>s.DecisionTreeRegressor'>
GaussianMixture          ABCMeta             <class 'sklearn.mixture._<...>mixture.GaussianMixture'>
GaussianNB               ABCMeta             <class 'sklearn.naive_bayes.GaussianNB'>
GridSearchCV             ABCMeta             <class 'sklearn.model_sel<...>on._search.GridSearchCV'>
IQR                      float               0.07537517285865299
KBinsDiscretizer         type                <class 'sklearn.preproces<...>zation.KBinsDiscretizer'>
KMeans                   ABCMeta             <class 'sklearn.cluster._kmeans.KMeans'>
KNeighborsClassifier     ABCMeta             <class 'sklearn.neighbors<...>on.KNeighborsClassifier'>
KNeighborsRegressor      ABCMeta             <c

In [19]:


print("\n3️⃣ KNN Algorithm:")

# Classification
# Hold out 30% of the iris rows (fixed seed). The resulting train/test split
# is reused by the later classification cells.
X_clftrain, X_clftest, y_clftrain, y_clftest = train_test_split(
    X_clf, y_clf, test_size=0.3, random_state=1
)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_clftrain, y_clftrain)

knn_clf_pred = knn.predict(X_clftest)
print("KNN Classification Accuracy:",
      round(accuracy_score(y_clftest, knn_clf_pred), 4))

# Regression
# Same 70/30 scheme on the imputed diabetes features. This split is reused by
# the later regression cells.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=1
)
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(Xr_train, yr_train)

# RMSE is computed as sqrt(MSE) so it does not depend on sklearn's
# version-specific RMSE helpers.
knn_reg_pred = knn_reg.predict(Xr_test)
rmse = np.sqrt(mean_squared_error(yr_test, knn_reg_pred))
print("KNN Regression RMSE:", round(rmse, 4))



3️⃣ KNN Algorithm:
KNN Classification Accuracy: 0.9778
KNN Regression RMSE: 59.2403


In [20]:
# 4. Decision Tree Classification + Tuning
# ----------------------------------------------------
print("\n4️⃣ Decision Tree Classification with Tuning:")

# Search grid: tree depth (None = unlimited) x minimum samples needed to split.
params = {'max_depth': [2, 3, 4, None], 'min_samples_split': [2, 4, 6]}

# 3-fold cross-validated grid search on the training split only; the test
# split is touched once, for the final accuracy below.
dt = GridSearchCV(DecisionTreeClassifier(random_state=0), params, cv=3)
dt.fit(X_clftrain, y_clftrain)
print("Best Params:", dt.best_params_)

dt_pred = dt.predict(X_clftest)
print("Accuracy:", round(accuracy_score(y_clftest, dt_pred), 4))



4️⃣ Decision Tree Classification with Tuning:
Best Params: {'max_depth': 3, 'min_samples_split': 2}
Accuracy: 0.9556


In [26]:

# 5. Decision Tree Regression
# ----------------------------------------------------
# DecisionTreeRegressor, mean_squared_error, r2_score and numpy are already
# imported in the notebook's import cell, so they are not re-imported here.
print("\n5️⃣ Decision Tree Regression:")

# Depth is capped at 4; random_state makes tie-breaking repeatable.
dtr = DecisionTreeRegressor(max_depth=4, random_state=0)
dtr.fit(Xr_train, yr_train)

pred = dtr.predict(Xr_test)

# RMSE as sqrt(MSE): works on every scikit-learn version, unlike the
# `squared=False` keyword, which is not available in all releases.
mse = mean_squared_error(yr_test, pred)
rmse = np.sqrt(mse)

print("RMSE:", round(rmse, 4), "R2:", round(r2_score(yr_test, pred), 4))




5️⃣ Decision Tree Regression:
RMSE: 69.5493 R2: 0.0392


In [32]:
# 6. Random Forest
# ----------------------------------------------------
print("\n6️⃣ Random Forest:")

# Classification: 50 trees on the iris training split, scored on the held-out rows.
rf = RandomForestClassifier(n_estimators=50, random_state=0)
rf.fit(X_clftrain, y_clftrain)
rf_pred = rf.predict(X_clftest)
print("Random Forest Accuracy:", round(accuracy_score(y_clftest, rf_pred), 4))

# Random Forest Regression: 50 trees on the diabetes training split.
rfr = RandomForestRegressor(n_estimators=50, random_state=0)
rfr.fit(Xr_train, yr_train)

pred_reg = rfr.predict(Xr_test)
# RMSE = sqrt(MSE), in the units of the target.
rmse = np.sqrt(mean_squared_error(yr_test, pred_reg))

print("Random Forest RMSE:", round(rmse, 4))




6️⃣ Random Forest:
Random Forest Accuracy: 0.9556
Random Forest RMSE: 62.131


In [35]:
# 7. Naive Bayes
# ----------------------------------------------------
print("\n7️⃣ Naive Bayes:")

# Gaussian Naive Bayes fitted on the iris training split and scored on the
# held-out rows.
nb = GaussianNB()
nb.fit(X_clftrain, y_clftrain)
nb_pred = nb.predict(X_clftest)
print("Naive Bayes Accuracy:", round(accuracy_score(y_clftest, nb_pred), 4))



7️⃣ Naive Bayes:
Naive Bayes Accuracy: 0.9333


In [36]:
# 8. SVM Classification
# ----------------------------------------------------
print("\n8️⃣ Support Vector Machine:")

# RBF-kernel support vector classifier with regularization strength C=1.0,
# fitted on the iris training split.
svc = SVC(kernel='rbf', C=1.0)
svc.fit(X_clftrain, y_clftrain)
svc_pred = svc.predict(X_clftest)
print("SVM Accuracy:", round(accuracy_score(y_clftest, svc_pred), 4))



8️⃣ Support Vector Machine:
SVM Accuracy: 0.9778


In [41]:
# 9. Simple Linear Regression
# ----------------------------------------------------
print("\n9️⃣ Simple Linear Regression:")

lr = LinearRegression()

# "Simple" regression = one predictor: use only the first feature column.
# The double brackets keep the selection two-dimensional, as sklearn expects.
first_feature = [Xr_train.columns[0]]
lr.fit(Xr_train[first_feature], yr_train)

pred = lr.predict(Xr_test[first_feature])

# RMSE as sqrt(MSE), avoiding the version-dependent `squared=False` keyword.
mse = mean_squared_error(yr_test, pred)
rmse = np.sqrt(mse)

print("Linear Regression RMSE:", round(rmse, 4))



9️⃣ Simple Linear Regression:
Linear Regression RMSE: 70.4145


In [42]:
# 10. Logistic Regression
# ----------------------------------------------------
print("\n🔟 Logistic Regression:")

# Turn the 3-class iris target into a binary one: 1 for class 0, 0 for the rest.
y_bin = (y_clf == 0).astype(int)

# This cell makes its own 70/30 split (random_state=2) instead of reusing the
# earlier iris split, because the target differs.
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    X_clf, y_bin, test_size=0.3, random_state=2
)
log = LogisticRegression(max_iter=200)
log.fit(Xb_train, yb_train)
log_pred = log.predict(Xb_test)
print("Logistic Regression Accuracy:", round(accuracy_score(yb_test, log_pred), 4))



🔟 Logistic Regression:
Logistic Regression Accuracy: 1.0


In [44]:
# 11. Multi-layer Perceptron
# ----------------------------------------------------
print("\n1️⃣1️⃣ Multi-layer Perceptron:")

# One hidden layer of 50 units; up to 500 training iterations; seeded so the
# weight initialization is repeatable.
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=0)
mlp.fit(X_clftrain, y_clftrain)
mlp_pred = mlp.predict(X_clftest)
print("MLP Accuracy:", round(accuracy_score(y_clftest, mlp_pred), 4))



1️⃣1️⃣ Multi-layer Perceptron:
MLP Accuracy: 1.0


In [45]:
# 12. K-Means
# ----------------------------------------------------
print("\n1️⃣2️⃣ K-Means Clustering:")

# Candidate cluster counts, defined once so the loop and the report agree.
k_values = list(range(1, 6))

# Inertia (within-cluster sum of squared distances) for each k; listing the
# values side by side supports an elbow-style choice of k.
inertias = []
for k in k_values:
    km = KMeans(n_clusters=k, random_state=0)
    km.fit(X_clf)
    inertias.append(km.inertia_)
print("K:", k_values)
print("Inertias:", [round(i, 2) for i in inertias])



1️⃣2️⃣ K-Means Clustering:
K: [1, 2, 3, 4, 5]
Inertias: [681.37, 152.35, 78.86, 57.23, 49.83]


In [46]:
# 13. Fuzzy C-Means (if skfuzzy available)
# ----------------------------------------------------
print("\n1️⃣3️⃣ Fuzzy C-Means:")

# Only the optional import is guarded. An ImportError raised from inside the
# clustering call itself would be a real failure and should not be reported
# as "not installed".
try:
    import skfuzzy as fuzz
except ImportError:
    print("skfuzzy not installed, skipping Fuzzy C-Means.")
else:
    # cmeans takes the data transposed relative to sklearn: (features, samples).
    data = X_clf.values.T
    # seed=0 fixes the random initial partition so the reported FPC is repeatable.
    cntr, u, _, _, _, _, fpc = fuzz.cluster.cmeans(
        data, c=3, m=2, error=0.005, maxiter=1000, seed=0
    )
    print("Fuzzy C-Means FPC:", round(fpc, 4))



1️⃣3️⃣ Fuzzy C-Means:
skfuzzy not installed, skipping Fuzzy C-Means.


In [47]:
# 14. Expectation Maximization (Gaussian Mixture)
# ----------------------------------------------------
print("\n1️⃣4️⃣ Expectation Maximization (GMM):")

# Fit a 3-component Gaussian mixture (EM) on all iris features, then hard-assign
# every row to a component.
gmm = GaussianMixture(n_components=3, random_state=0)
gmm.fit(X_clf)
labels = gmm.predict(X_clf)
centers = gmm.means_

# centers[labels] lines up each row with the mean of its assigned component, so
# one vectorized norm over axis=1 gives every row's distance to that mean.
sum_euc = np.linalg.norm(X_clf.values - centers[labels], axis=1).sum()
print("Sum of Euclidean distances:", round(sum_euc, 3))



1️⃣4️⃣ Expectation Maximization (GMM):
Sum of Euclidean distances: 100.521
