**01.Import the numpy and pandas libraries**

In [1]:
import numpy as np
import pandas as pd

**02.Load the data set ls-01**

In [4]:
from google.colab import files
uploaded = files.upload()
df = pd.read_csv('ls-01.csv')
df.head()

Saving ls-01.csv to ls-01 (1).csv


Unnamed: 0,Age,Gender,Region,Income,Married,Children,Car,Mortgage,Pep
0,48,,INNER_CITY,17546.0,NO,YES,NO,NO,YES
1,40,MALE,TOWN,30085.1,YES,YES,YES,YES,NO
2,51,FEMALE,INNER_CITY,16575.4,YES,NO,YES,NO,NO
3,23,FEMALE,TOWN,20375.4,YES,YES,NO,NO,NO
4,57,FEMALE,RURAL,50576.3,YES,NO,NO,NO,NO


**03.Create the two copies of the data set**

In [5]:
df_copy1 = df.copy()
df_copy2 = df.copy()

**04.Check the null values**

In [14]:
# prompt: Check the null values

df.isnull().sum()

Unnamed: 0,0
Age,0
Gender,49
Region,0
Income,0
Married,0
Children,0
Car,0
Mortgage,0
Pep,0


**05.Missing data handling**

**06.Remove the missing values**

In [15]:
# prompt: 05.Missing data handling
# 06.Remove the missing values

# Removing rows with missing values
df_copy1.dropna(inplace=True)

# Checking the shape after removing missing values
print("Shape after removing missing values:", df_copy1.shape)

# Displaying the first few rows after removing missing values
print("\nDataFrame after removing missing values:")
print(df_copy1.head())

Shape after removing missing values: (237, 9)

DataFrame after removing missing values:
   Age  Gender      Region   Income Married Children  Car Mortgage  Pep
1   40    MALE        TOWN  30085.1     YES      YES  YES      YES   NO
2   51  FEMALE  INNER_CITY  16575.4     YES       NO  YES       NO   NO
3   23  FEMALE        TOWN  20375.4     YES      YES   NO       NO   NO
4   57  FEMALE       RURAL  50576.3     YES       NO   NO       NO   NO
5   57  FEMALE        TOWN  37869.6     YES      YES   NO       NO  YES


**07.Filling the missing values**

In [17]:
# 07.Filling the missing values

# Fill numeric columns with mean
df_filled_mean = df.copy()
numeric_cols = df_filled_mean.select_dtypes(include=np.number).columns
df_filled_mean[numeric_cols] = df_filled_mean[numeric_cols].fillna(df_filled_mean[numeric_cols].mean())
print("Missing values after filling with mean:")
print(df_filled_mean.isnull().sum())

# Fill numeric columns with median
df_filled_median = df.copy()
numeric_cols = df_filled_median.select_dtypes(include=np.number).columns
df_filled_median[numeric_cols] = df_filled_median[numeric_cols].fillna(df_filled_median[numeric_cols].median())
print("\nMissing values after filling with median:")
print(df_filled_median.isnull().sum())

# Fill object type columns with mode
df_filled_mode = df.copy()
object_cols = df_filled_mode.select_dtypes(include='object').columns
df_filled_mode[object_cols] = df_filled_mode[object_cols].fillna(df_filled_mode[object_cols].mode().iloc[0])
print("\nMissing values after filling with mode:")
print(df_filled_mode.isnull().sum())

Missing values after filling with mean:
Age          0
Gender      49
Region       0
Income       0
Married      0
Children     0
Car          0
Mortgage     0
Pep          0
dtype: int64

Missing values after filling with median:
Age          0
Gender      49
Region       0
Income       0
Married      0
Children     0
Car          0
Mortgage     0
Pep          0
dtype: int64

Missing values after filling with mode:
Age         0
Gender      0
Region      0
Income      0
Married     0
Children    0
Car         0
Mortgage    0
Pep         0
dtype: int64


**08.Group the data**

In [18]:
# prompt: 08.Group the data

# Assuming you want to group by a specific column and calculate some aggregate
# Replace 'column_to_group_by' with the actual column name you want to group by
# Replace 'column_to_aggregate' with the column you want to aggregate
# Replace 'mean()' with the aggregation function you want (e.g., sum(), count(), size(), etc.)

# Example 1: Group by 'Category' column and calculate the mean of 'Value' column
if 'Category' in df.columns and 'Value' in df.columns:
    grouped_data = df.groupby('Category')['Value'].mean()
    print("\nGrouped Data (mean of 'Value' by 'Category'):")
    print(grouped_data)
else:
    print("\nSkipping grouping as 'Category' or 'Value' column not found in the DataFrame.")

# Example 2: Group by multiple columns and calculate the size of each group
# Replace 'Column1' and 'Column2' with the actual column names
if 'Column1' in df.columns and 'Column2' in df.columns:
    grouped_size = df.groupby(['Column1', 'Column2']).size().reset_index(name='Count')
    print("\nGrouped Data (count by 'Column1' and 'Column2'):")
    print(grouped_size)
else:
    print("\nSkipping grouping as 'Column1' or 'Column2' column not found in the DataFrame.")

# Example 3: Group by a column and apply multiple aggregation functions
if 'Category' in df.columns and 'Value' in df.columns and 'Another_Value' in df.columns:
    grouped_agg = df.groupby('Category').agg({
        'Value': 'mean',
        'Another_Value': ['sum', 'count']
    })
    print("\nGrouped Data with multiple aggregations:")
    print(grouped_agg)
else:
     print("\nSkipping grouping as required columns for multiple aggregations not found in the DataFrame.")

# You can choose and modify one of the examples above based on your specific grouping needs.
# If you don't know the column names, you can print the column names first:
# print(df.columns)


Skipping grouping as 'Category' or 'Value' column not found in the DataFrame.

Skipping grouping as 'Column1' or 'Column2' column not found in the DataFrame.

Skipping grouping as required columns for multiple aggregations not found in the DataFrame.


**09.Label the missing values**

In [19]:
# prompt: Label the missing values

# Replacing missing values with a specific label, e.g., 'Missing'
# This is often done for categorical or object type columns
df_labeled = df.copy()

# Select object type columns to label missing values
object_cols_to_label = df_labeled.select_dtypes(include='object').columns

# Replace NaN in selected object columns with 'Missing'
df_labeled[object_cols_to_label] = df_labeled[object_cols_to_label].fillna('Missing')

print("\nMissing values after labeling:")
print(df_labeled.isnull().sum())

# You can also label missing numeric values if needed, but it's less common.
# If you want to label numeric missing values, you might choose a value
# that is outside the expected range or a specific marker value.
# For example, replacing with -999:
# numeric_cols_to_label = df_labeled.select_dtypes(include=np.number).columns
# df_labeled[numeric_cols_to_label] = df_labeled[numeric_cols_to_label].fillna(-999)
# print("\nMissing values after labeling (numeric):")
# print(df_labeled.isnull().sum())

# Displaying the first few rows after labeling missing values
print("\nDataFrame after labeling missing values:")
print(df_labeled.head())


Missing values after labeling:
Age         0
Gender      0
Region      0
Income      0
Married     0
Children    0
Car         0
Mortgage    0
Pep         0
dtype: int64

DataFrame after labeling missing values:
   Age   Gender      Region   Income Married Children  Car Mortgage  Pep
0   48  Missing  INNER_CITY  17546.0      NO      YES   NO       NO  YES
1   40     MALE        TOWN  30085.1     YES      YES  YES      YES   NO
2   51   FEMALE  INNER_CITY  16575.4     YES       NO  YES       NO   NO
3   23   FEMALE        TOWN  20375.4     YES      YES   NO       NO   NO
4   57   FEMALE       RURAL  50576.3     YES       NO   NO       NO   NO


**10.Group the data according to the gender**

In [20]:
# prompt: Group the data according to the gender

# 10.Group the data according to the gender

# Assuming 'Gender' is the column containing gender information
if 'Gender' in df.columns:
    grouped_by_gender = df.groupby('Gender')
    print("\nGrouped Data by Gender:")

    # You can now perform various operations on this grouped object
    # For example, to see the size of each group:
    print("\nSize of each gender group:")
    print(grouped_by_gender.size())

    # To calculate the mean of numerical columns for each gender:
    print("\nMean of numerical columns for each gender:")
    print(grouped_by_gender.mean(numeric_only=True))

    # To get the first row of each group:
    # print("\nFirst row of each gender group:")
    # print(grouped_by_gender.first())

    # To iterate through the groups:
    # print("\nIterating through gender groups:")
    # for gender, group_df in grouped_by_gender:
    #     print(f"\nGender: {gender}")
    #     print(group_df.head())

else:
    print("\nSkipping grouping by gender as 'Gender' column not found in the DataFrame.")


Grouped Data by Gender:

Size of each gender group:
Gender
FEMALE    122
MALE      115
dtype: int64

Mean of numerical columns for each gender:
              Age        Income
Gender                         
FEMALE  44.057377  28416.321639
MALE    39.843478  26327.131565


**11.Import the data set as**

Dataset=[['milk', 'onion', 'nugmeg', 'kidney beans', 'eggs', 'yogurt'],

['oill', 'onion', 'nutmeg', 'kidney beans', 'eggs', 'yogurt'],

['milk', 'apple', 'kidney beans', 'eggs'],

['milk', 'unicorn', 'corn', 'kidney beans', 'yogurt']

['corn', 'onion', 'onion', 'kidney beans', 'ice cream', 'eggs']]

In [21]:
Dataset=[['milk', 'onion', 'nugmeg', 'kidney beans', 'eggs', 'yogurt'],
['oill', 'onion', 'nutmeg', 'kidney beans', 'eggs', 'yogurt'],
['milk', 'apple', 'kidney beans', 'eggs'],
['milk', 'unicorn', 'corn', 'kidney beans', 'yogurt'],
['corn', 'onion', 'onion', 'kidney beans', 'ice cream', 'eggs']]

print("\nImported Dataset:")
Dataset


Imported Dataset:


[['milk', 'onion', 'nugmeg', 'kidney beans', 'eggs', 'yogurt'],
 ['oill', 'onion', 'nutmeg', 'kidney beans', 'eggs', 'yogurt'],
 ['milk', 'apple', 'kidney beans', 'eggs'],
 ['milk', 'unicorn', 'corn', 'kidney beans', 'yogurt'],
 ['corn', 'onion', 'onion', 'kidney beans', 'ice cream', 'eggs']]

**12.Import the pandas and mlxtend libraries**

In [22]:
# prompt: 12.Import the pandas and mlxtend libraries

!pip install mlxtend
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder



**13.Create the dataset as a dataframe**

In [23]:
# Initialize the TransactionEncoder
te = TransactionEncoder()

# Fit and transform the Dataset
te_ary = te.fit(Dataset).transform(Dataset)

# Create a DataFrame from the transformed data
df = pd.DataFrame(te_ary, columns=te.columns_)

print("\nDataset as a DataFrame (one-hot encoded):")
df


Dataset as a DataFrame (one-hot encoded):


Unnamed: 0,apple,corn,eggs,ice cream,kidney beans,milk,nugmeg,nutmeg,oill,onion,unicorn,yogurt
0,False,False,True,False,True,True,True,False,False,True,False,True
1,False,False,True,False,True,False,False,True,True,True,False,True
2,True,False,True,False,True,True,False,False,False,False,False,False
3,False,True,False,False,True,True,False,False,False,False,True,True
4,False,True,True,True,True,False,False,False,False,True,False,False


**14.Create the possible item set for the min_support=0.6 using apriori algorithm**

In [24]:
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)

print("\nFrequent Itemsets (min_support = 0.6):")
frequent_itemsets


Frequent Itemsets (min_support = 0.6):


Unnamed: 0,support,itemsets
0,0.8,(eggs)
1,1.0,(kidney beans)
2,0.6,(milk)
3,0.6,(onion)
4,0.6,(yogurt)
5,0.8,"(kidney beans, eggs)"
6,0.6,"(onion, eggs)"
7,0.6,"(kidney beans, milk)"
8,0.6,"(kidney beans, onion)"
9,0.6,"(kidney beans, yogurt)"


**15.Create the association rule mining**

In [25]:
# prompt: Create the association rule mining

from mlxtend.frequent_patterns import association_rules

# Generate association rules with a minimum confidence of 0.7
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

print("\nAssociation Rules (min_confidence = 0.7):")
rules


Association Rules (min_confidence = 0.7):


  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(kidney beans),(eggs),1.0,0.8,0.8,0.8,1.0,1.0,0.0,1.0,0.0,0.8,0.0,0.9
1,(eggs),(kidney beans),0.8,1.0,0.8,1.0,1.0,1.0,0.0,inf,0.0,0.8,0.0,0.9
2,(onion),(eggs),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
3,(eggs),(onion),0.8,0.6,0.6,0.75,1.25,1.0,0.12,1.6,1.0,0.75,0.375,0.875
4,(milk),(kidney beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
5,(onion),(kidney beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
6,(yogurt),(kidney beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
7,"(kidney beans, onion)",(eggs),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
8,"(kidney beans, eggs)",(onion),0.8,0.6,0.6,0.75,1.25,1.0,0.12,1.6,1.0,0.75,0.375,0.875
9,"(onion, eggs)",(kidney beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8


**16.Create the sub set of antecedents,consequents,support,confidence,lif**t

In [26]:
# prompt: Create the sub set of antecedents,consequents,support,confidence,lift

# Select the desired columns
subset_rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

print("\nSubset of Association Rules (antecedents, consequents, support, confidence, lift):")
subset_rules



Subset of Association Rules (antecedents, consequents, support, confidence, lift):


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(kidney beans),(eggs),0.8,0.8,1.0
1,(eggs),(kidney beans),0.8,1.0,1.0
2,(onion),(eggs),0.6,1.0,1.25
3,(eggs),(onion),0.6,0.75,1.25
4,(milk),(kidney beans),0.6,1.0,1.0
5,(onion),(kidney beans),0.6,1.0,1.0
6,(yogurt),(kidney beans),0.6,1.0,1.0
7,"(kidney beans, onion)",(eggs),0.6,1.0,1.25
8,"(kidney beans, eggs)",(onion),0.6,0.75,1.25
9,"(onion, eggs)",(kidney beans),0.6,1.0,1.0


**17.Retrieve the itemset which itemset have confidence >= 1**

In [27]:
# prompt: Retrieve the itemset which itemset have confidence >= 1

# Filter the rules DataFrame to get itemsets with confidence >= 1
high_confidence_rules = rules[rules['confidence'] >= 1]

print("\nItemsets with Confidence >= 1:")
high_confidence_rules


Itemsets with Confidence >= 1:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
1,(eggs),(kidney beans),0.8,1.0,0.8,1.0,1.0,1.0,0.0,inf,0.0,0.8,0.0,0.9
2,(onion),(eggs),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
4,(milk),(kidney beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
5,(onion),(kidney beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
6,(yogurt),(kidney beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
7,"(kidney beans, onion)",(eggs),0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
9,"(onion, eggs)",(kidney beans),0.6,1.0,0.6,1.0,1.0,1.0,0.0,inf,0.0,0.6,0.0,0.8
10,(onion),"(kidney beans, eggs)",0.6,0.8,0.6,1.0,1.25,1.0,0.12,inf,0.5,0.75,1.0,0.875
