<a href="https://colab.research.google.com/github/Gazalapar/Data-Analytics-Lab/blob/main/LabAssignment3%264.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data Analytics  Lab Assignment 3**



**using cosine simlarity find the simlarity between two** **documents**

In [None]:
import os
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the path to the directory where your files are located
directory = '/content/drive/MyDrive/Colab Notebooks/TextDocs'  # Update if needed

# Initialize a list to store document contents
documents = []

# Read all text files from the directory
for i in range(1, 6):
    file_path = os.path.join(directory, f"Doc{i}.txt")  # Update to match filenames
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            documents.append(content)
    except FileNotFoundError:
        print(f"File not found: {file_path}")

# Proceed with text processing and similarity computation if documents are read successfully
if documents:
    # Preprocessing: Tokenization, removing punctuation, lowercasing
    def preprocess(doc):
        doc = re.sub(r'\W+', ' ', doc).lower()
        return doc

    processed_docs = [preprocess(doc) for doc in documents]

    # Vectorize the documents using TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(processed_docs)

    # Compute the cosine similarity matrix
    cosine_sim_matrix = cosine_similarity(tfidf_matrix)

    # Create the upper triangular matrix for cosine similarities (excluding the diagonal)
    upper_triangular_matrix = np.triu(cosine_sim_matrix, k=1)

    print("Cosine Similarity Upper Triangular Matrix:")
    print(upper_triangular_matrix)

    # Find the maximum similarity and its indices
    max_similarity = np.max(upper_triangular_matrix)
    max_index = np.unravel_index(np.argmax(upper_triangular_matrix), upper_triangular_matrix.shape)

    doc_pair = (max_index[0] + 1, max_index[1] + 1)

    print(f"\nThe most similar pair of documents are Document {doc_pair[0]} and Document {doc_pair[1]} with a cosine similarity of {max_similarity:.4f}.")


Cosine Similarity Upper Triangular Matrix:
[[0.         0.02122381 0.04275781 0.0461145  0.04792865]
 [0.         0.         0.01931133 0.         0.01820505]
 [0.         0.         0.         0.         0.11002849]
 [0.         0.         0.         0.         0.06440101]
 [0.         0.         0.         0.         0.        ]]

The most similar pair of documents are Document 3 and Document 5 with a cosine similarity of 0.1100.


**Data Analytics Lab Assignment 4**

**Apriori Algorithm Implementation**

In [None]:
from itertools import combinations

# Function to calculate support for itemsets
def calculate_support(transactions, itemsets):
    support_count = {}
    for itemset in itemsets:
        for transaction in transactions:
            if set(itemset).issubset(set(transaction)):
                if itemset in support_count:
                    support_count[itemset] += 1
                else:
                    support_count[itemset] = 1
    return support_count

# Function to generate candidate itemsets of size k
def generate_new_combinations(frequent_itemsets, k):
    new_combinations = []
    frequent_itemsets_list = list(frequent_itemsets.keys())
    for i in range(len(frequent_itemsets_list)):
        for j in range(i+1, len(frequent_itemsets_list)):
            new_combination = tuple(sorted(set(frequent_itemsets_list[i]) | set(frequent_itemsets_list[j])))
            if len(new_combination) == k and new_combination not in new_combinations:
                new_combinations.append(new_combination)
    return new_combinations

# Apriori algorithm implementation
def apriori(transactions, min_support):
    # Step 1: Generate initial candidate itemsets (of size 1)
    single_items = set(item for transaction in transactions for item in transaction)
    itemsets = [(item,) for item in single_items]

    # Step 2: Calculate support and prune non-frequent itemsets
    support_count = calculate_support(transactions, itemsets)
    frequent_itemsets = {k: v for k, v in support_count.items() if v >= min_support}

    all_frequent_itemsets = {}
    k = 2

    while frequent_itemsets:
        all_frequent_itemsets.update(frequent_itemsets)

        # Step 3: Generate candidate itemsets of size k
        itemsets = generate_new_combinations(frequent_itemsets, k)

        # Step 4: Calculate support and prune non-frequent itemsets
        support_count = calculate_support(transactions, itemsets)
        frequent_itemsets = {k: v for k, v in support_count.items() if v >= min_support}

        k += 1

    return all_frequent_itemsets

# Example usage
transactions = [
    ['Bread', 'Milk'],
    ['Bread', 'Diaper', 'Beer', 'Eggs'],
    ['Milk', 'Diaper', 'Beer', 'Cola'],
    ['Bread', 'Milk', 'Diaper', 'Beer'],
    ['Bread', 'Milk', 'Cola']
]

min_support = 2
frequent_itemsets = apriori(transactions, min_support)
print("Frequent Itemsets:")
for itemset, support in frequent_itemsets.items():
    print(f"{itemset}: {support}")


Frequent Itemsets:
('Beer',): 3
('Cola',): 2
('Milk',): 4
('Diaper',): 3
('Bread',): 4
('Beer', 'Milk'): 2
('Beer', 'Diaper'): 3
('Beer', 'Bread'): 2
('Cola', 'Milk'): 2
('Diaper', 'Milk'): 2
('Bread', 'Milk'): 3
('Bread', 'Diaper'): 2
('Beer', 'Diaper', 'Milk'): 2
('Beer', 'Bread', 'Diaper'): 2


In [3]:
def apriori(transactions, min_support, max_support):
    from collections import defaultdict

    # Step 1: Count frequency of individual items (1-itemsets)
    def get_frequent_itemsets(itemsets, transactions, min_support, max_support):
        itemset_count = defaultdict(int)
        for transaction in transactions:
            for itemset in itemsets:
                if set(itemset).issubset(transaction):
                    itemset_count[itemset] += 1

        # Prune itemsets based on min_support and max_support
        frequent_itemsets = {}
        for itemset, count in itemset_count.items():
            if min_support <= count <= max_support:  # Properly filter by min and max support
                frequent_itemsets[itemset] = count

        return frequent_itemsets

    # Step 2: Generate candidate k-itemsets
    def generate_candidates(itemsets, k):
        candidates = set()
        length = len(itemsets)
        itemsets = list(itemsets)

        for i in range(length):
            for j in range(i+1, length):
                candidate = tuple(sorted(set(itemsets[i]).union(itemsets[j])))
                if len(candidate) == k:
                    candidates.add(candidate)
        return candidates

    # Initial step: Start with 1-itemsets
    itemsets = set()
    for transaction in transactions:
        for item in transaction:
            itemsets.add((item,))

    # List to store all frequent itemsets with their frequencies
    all_frequent_itemsets = {}

    k = 1
    while itemsets:
        # Get frequent itemsets for the current size k
        frequent_itemsets = get_frequent_itemsets(itemsets, transactions, min_support, max_support)

        if not frequent_itemsets:
            break

        all_frequent_itemsets.update(frequent_itemsets)

        # Generate candidates for the next size k+1
        itemsets = generate_candidates(frequent_itemsets, k + 1)
        k += 1

    return all_frequent_itemsets

'''# Example usage
transactions = [
    ['milk', 'bread', 'butter'],
    ['beer', 'bread'],
    ['milk', 'bread'],
    ['butter', 'bread'],
    ['milk', 'butter']
]
'''

transactions = [
    ['Bread', 'Milk'],
    ['Bread', 'Diaper', 'Beer', 'Eggs'],
    ['Milk', 'Diaper', 'Beer', 'Cola'],
    ['Bread', 'Milk', 'Diaper', 'Beer'],
    ['Bread', 'Milk', 'Cola']
]
min_support = 2
max_support = 3

frequent_itemsets = apriori(transactions, min_support, max_support)


for itemset, frequency in frequent_itemsets.items():
    print(f"Itemset: {itemset}, Frequency: {frequency}")


Itemset: ('Beer',), Frequency: 3
Itemset: ('Diaper',), Frequency: 3
Itemset: ('Cola',), Frequency: 2
Itemset: ('Beer', 'Diaper'), Frequency: 3
