In [2]:
from google.colab import drive
drive.mount('/content/drive')

  and should_run_async(code)


Mounted at /content/drive


# Data Preprocessing & Exploration


1.	Missing Value Handling:
○	Practical: Use techniques like mean imputation, median imputation, or predictive imputation to handle missing values in datasets.
○	Dataset: You can use the "Adult Income" dataset from the UCI Machine Learning Repository, which contains missing values in various attributes such as education and occupation. Dataset link


In [None]:
import pandas as pd
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country', 'income']
data = pd.read_csv(url)


numeric_columns = data.select_dtypes(include=['int', 'float']).columns
data_mean = data.copy()
data_mean[numeric_columns] = data_mean[numeric_columns].fillna(data_mean[numeric_columns].mean())

numeric_columns = data.select_dtypes(include=['int', 'float']).columns
data_median = data.copy()
data_median[numeric_columns] = data_median[numeric_columns].fillna(data_median[numeric_columns].median())


from sklearn.impute import SimpleImputer

data_predictive = data.copy()
imp = SimpleImputer(strategy='most_frequent')
# data_predictive[['workclass', 'occupation', 'native-country']] = imp.fit_transform(data_predictive[['workclass', 'occupation', 'native-country']])

print("Mean imputation:")
print(data_mean.head())

print("\nMedian imputation:")
print(data_median.head())

print("\nPredictive imputation:")
print(data_predictive.head())

2.	Outlier Detection:
○	Practical: Identify and handle outliers in the data using methods like z-score, IQR (Interquartile Range), or visualization techniques.
○	Dataset: The "Credit Card Fraud Detection" dataset from Kaggle contains transactions with potential outliers representing fraudulent activities. Dataset link


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
# change the 'credit_card_fraud_dataset.csv' to url/csv file
df = pd.read_csv('/content/creditcard.csv')

# Z-Score method
z_scores = np.abs((df - df.mean()) / df.std())
threshold = 3
outliers_zscore = df[(z_scores > threshold).any(axis=1)]
# print('Z-Score method')
# print(outliers_zscore)

# IQR method
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_iqr = df[((df < lower_bound) | (df > upper_bound)).any(axis=1)]
# print("IQR METHOD")
# print(outliers_iqr)

# Visualization - Boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(data=df)
plt.title('Boxplot of Credit Card Fraud Dataset')
plt.xticks(rotation=45)
plt.show()


3.	Feature Scaling:
○	Practical: Normalize or standardize features in the dataset to ensure fair comparisons and improve machine learning model performance.
○	Dataset: The "Wine Quality" dataset from the UCI Machine Learning Repository includes features related to wine properties that can benefit from feature scaling. Dataset link

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
df = pd.read_csv(url, sep=';')

# Display the first few rows of the dataset
print(df.head())

# Separate features (attributes) from the target variable
X = df.drop('quality', axis=1)  # Features
y = df['quality']  # Target variable

# Standardization
scaler_standard = StandardScaler()
X_standardized = scaler_standard.fit_transform(X)

# Normalization
scaler_minmax = MinMaxScaler()
X_normalized = scaler_minmax.fit_transform(X)

# Convert the standardized and normalized arrays back to DataFrame for display
X_standardized_df = pd.DataFrame(X_standardized, columns=X.columns)
X_normalized_df = pd.DataFrame(X_normalized, columns=X.columns)

# Display the first few rows of the standardized and normalized datasets
print("\nStandardized Data:")
print(X_standardized_df.head())

print("\nNormalized Data:")
print(X_normalized_df.head())


4.	Data Visualization:
○	Practical: Explore the dataset visually using plots such as histograms, scatter plots, and box plots to understand data distributions and relationships.
○	Dataset: The "Iris" dataset is a classic dataset often used for data visualization tasks, showcasing features of different iris flower species. Dataset link


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
df = pd.read_csv(url, names=column_names)

# Display the first few rows of the dataset
# print(df.head())

# Histograms for feature distributions
plt.figure(figsize=(10, 6))
df.hist(figsize=(10, 8))
# plt.tight_layout()
# plt.show()

# Pairplot for pairwise relationships and distributions
plt.figure(figsize=(10, 8))
sns.pairplot(df, hue='species')
# plt.show()

# Box plot for feature distributions by species
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='species', y='sepal_length')
plt.title('Sepal Length Distribution by Species')
# plt.show()


# Association Rule Mining - Apriori algorithm

1.	Frequent Itemset Mining:
○	Experiment: Use the Apriori algorithm to mine frequent itemsets from transactional data.
○	Dataset: The "Online Retail" dataset from the UCI Machine Learning Repository contains transactional data from an online retail store, suitable for frequent itemset mining. Dataset link



In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
df = pd.read_excel(url)
df.head()
# Data Preprocessing
# Remove rows with missing values in 'InvoiceNo' column
df.dropna(subset=['InvoiceNo'], inplace=True)

# Remove credit transactions (starting with 'C')
# df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]

# Remove unnecessary columns
# df = df[['InvoiceNo', 'Description']]

# Group items by invoice and create transactional dataset
transactions = df.groupby('InvoiceNo')['Description'].apply(list)

# Convert transactions into one-hot encoded format
one_hot_encoded = transactions.str.join('|').str.get_dummies()

# Apply the Apriori algorithm
frequent_itemsets = apriori(one_hot_encoded, min_support=0.01, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Display the frequent itemsets and association rules
print("Frequent Itemsets:")
print(frequent_itemsets)

print("\nAssociation Rules:")
print(rules)




2.	Association Rule Generation:
○	Experiment: Generate association rules with specified support and confidence thresholds from the mined frequent itemsets.
○	Dataset: The "Groceries" dataset from the UCI Machine LearningRepository includes transactional data from a grocery store, ideal for association rule generation tasks. Dataset link



In [7]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
url = "/content/drive/MyDrive/Colab Notebooks/DATASETS/Groceries_dataset.csv"
df = pd.read_csv(url)

# Data Preprocessing
transactions = []
for index, row in df.iterrows():
    transactions.append([item.strip() for item in row if pd.notna(item)])

# Apply the Apriori algorithm
one_hot_encoded = pd.get_dummies(pd.DataFrame(transactions).stack()).groupby(level=0).sum()
frequent_itemsets = apriori(one_hot_encoded, min_support=0.01, use_colnames=True)

# Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
rules = rules[(rules['support'] >= 0.01) & (rules['confidence'] >= 0.5)]

# Display the association rules
print("Association Rules:")
print(rules)


  and should_run_async(code)


AttributeError: 'int' object has no attribute 'strip'

3.	Rule Evaluation and Pruning:
○	Experiment: Evaluate generated association rules based on metrics like lift, Support & confidence. Prune rules based on predefined criteria.
○	Dataset: The "Mushroom" dataset from the UCI Machine Learning Repository contains data about mushroom species, suitable for association rule evaluation and pruning. Dataset link


In [3]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
# Define column names based on dataset description
column_names = ["class", "cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment",
                "gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root",
                "stalk-surface-above-ring", "stalk-surface-below-ring", "stalk-color-above-ring",
                "stalk-color-below-ring", "veil-type", "veil-color", "ring-number", "ring-type",
                "spore-print-color", "population", "habitat"]
df = pd.read_csv(url, header=None, names=column_names)

# Data Preprocessing
# Convert categorical variables to dummy variables
df_encoded = pd.get_dummies(df)

# Apply the Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.3, use_colnames=True)

# Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

# Evaluate Association Rules
print("Association Rules:")
print(rules)

# Prune Rules based on predefined criteria
min_confidence = 0.7
min_lift = 1.5
pruned_rules = rules[(rules['confidence'] >= min_confidence) & (rules['lift'] >= min_lift)]

# Display pruned rules
print("\nPruned Rules:")
print(pruned_rules)


  and should_run_async(code)


Association Rules:
                          antecedents  \
0                           (class_e)   
1                         (bruises_t)   
2                            (odor_n)   
3                           (class_e)   
4                       (gill-size_b)   
...                               ...   
62737  (gill-attachment_f, bruises_t)   
62738                   (ring-type_p)   
62739    (stalk-surface-below-ring_s)   
62740    (stalk-surface-above-ring_s)   
62741                     (bruises_t)   

                                             consequents  antecedent support  \
0                                            (bruises_t)            0.517971   
1                                              (class_e)            0.415559   
2                                              (class_e)            0.434269   
3                                               (odor_n)            0.517971   
4                                              (class_e)            0.690793   
...     

4.	Rule Visualization and Interpretation:
○	Experiment: Visualize the generated association rules using graphs or charts for better understanding and interpretation.
○	Dataset: The "Market Basket Optimisation" dataset from Kaggle consists of transactional data from a grocery store, providing opportunities for rule visualization and interpretation


In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import networkx as nx
import matplotlib.pyplot as plt

# Load the dataset maam ne nhi diya hai lol dumb-fuck -kawal
url = "link"
df = pd.read_csv(url)

# Data Preprocessing
# Convert data into transaction format
transactions = df.groupby(['Member_number', 'Date'])['itemDescription'].apply(list).reset_index(name='items')

# One-hot encode the data
one_hot_encoded = transactions['items'].str.join('|').str.get_dummies('|')

# Apply the Apriori algorithm
frequent_itemsets = apriori(one_hot_encoded, min_support=0.01, use_colnames=True)

# Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Visualize Association Rules using Network Graph
G = nx.DiGraph()
for idx, rule in rules.iterrows():
    G.add_edge(rule['antecedents'], rule['consequents'], weight=rule['lift'])

plt.figure(figsize=(15, 10))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, node_size=2000)
nx.draw_networkx_labels(G, pos, font_size=10)
edges = nx.draw_networkx_edges(G, pos, arrowstyle='->', arrowsize=20, edge_color='b', width=2)
plt.title("Association Rules Network Graph")
plt.show()


# Classification: Naive Bayes Algorithm

1.	Spam Email Classification:
○	Practical: Train a Naive Bayes classifier to distinguish between spam and non-spam emails based on text features.
○	Dataset: The "Spambase" dataset from the UCI Machine Learning Repository contains email spam and non-spam data, ideal for spam classification tasks. Dataset link


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
df = pd.read_csv(url, header=None)

# Determine the number of columns in the dataset
num_columns = df.shape[1]

# Generate generic column names
column_names = [f"X{i}" for i in range(num_columns - 1)] + ["spam"]

# Assign column names to the DataFrame
df.columns = column_names

# Data Preprocessing
X = df.drop("spam", axis=1)  # Features
y = df["spam"]  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = nb_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


# #Clustering: K means algorithm

1.	Customer Segmentation:
○	Practical: Use K-means clustering to segment customers based on their purchasing behavior and demographics.
○	Dataset: The "Mall Customer Segmentation Data" from Kaggle contains information about customers such as age, income, and spending score, suitable for customer segmentation tasks. Dataset link


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the customer data
customer_data = pd.read_csv(r'/content/drive/MyDrive/dataset /dataset/Mall_Customers.csv')  # Replace 'customer_data.csv' with your dataset path

# Print the columns to identify the correct column names
print(customer_data.columns)

# Preprocess the data
# For illustration purposes, let's assume the data is already preprocessed and scaled
# If needed, handle missing values, encode categorical variables, and scale the features

# Select relevant features for clustering
features = customer_data[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Choose the number of clusters
num_clusters = 4  # For illustration, you can determine the optimal number of clusters using techniques like the elbow method

# Apply K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(scaled_features)

# Add cluster labels to the original data
customer_data['Cluster'] = kmeans.labels_
# Label encoding for categorical variable 'Gender'
customer_data['Gender'] = customer_data['Gender'].astype('category').cat.codes


# Visualize the clusters
plt.figure(figsize=(8, 6))
for cluster in range(num_clusters):
    cluster_data = customer_data[customer_data['Cluster'] == cluster]
    plt.scatter(cluster_data['Annual Income (k$)'], cluster_data['Spending Score (1-100)'], label=f'Cluster {cluster}')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Customer Segmentation')
plt.legend()
plt.grid(True)
plt.show()

# Profile each cluster
cluster_profiles = customer_data.groupby('Cluster').mean()
print("Cluster Profiles:")
print(cluster_profiles)


2.	Image Compression:
○	Practical: Apply K-means clustering to compress images by reducing the number of colors while preserving image quality.
○	Dataset: You can use images from public repositories or create your own dataset of images for image compression experiments.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from skimage import io

# Load the image
image = io.imread('/content/drive/MyDrive/Screenshot_2020-09-07-18-03-30-723_com.tencent.ig.jpg')  # Replace 'your_image.jpg' with the path to your image

# Reshape the image into a 2D array
w, h, d = original_shape = tuple(image.shape)
image_array = np.reshape(image, (w * h, d))

# Apply K-means clustering
n_colors = 16  # Number of colors to reduce to
kmeans = KMeans(n_clusters=n_colors, random_state=0)
kmeans.fit(image_array)

# Predict the cluster labels for each pixel
labels = kmeans.predict(image_array)

# Replace each pixel with its centroid value
compressed_image = np.zeros((w, h, d), dtype=np.uint8)
label_idx = 0
for i in range(w):
    for j in range(h):
        compressed_image[i][j] = kmeans.cluster_centers_[labels[label_idx]]
        label_idx += 1

# Display the original and compressed images
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
axes[0].imshow(image)
axes[0].set_title('Original Image')
axes[0].axis('off')
axes[1].imshow(compressed_image)
axes[1].set_title('Compressed Image ({} colors)'.format(n_colors))
axes[1].axis('off')
plt.show()
