<a href="https://colab.research.google.com/github/Hassan-Mahadjir/Machine-Learning/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import pandas as pd
import numpy as np
import re
import ast
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm # A library to show progress bars for loops

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [9]:
companies_df = pd.read_csv('ml_insurance_challenge.csv')
insurance_df = pd.read_csv('insurance_taxonomy - insurance_taxonomy.csv')

In [16]:
def parse_tags(x):
    if pd.isna(x): return ""
    if isinstance(x, (list, tuple)): return " ".join(x)
    try:
        parsed = ast.literal_eval(x)
        if isinstance(parsed, (list, tuple)):
            return " ".join(parsed)
    except Exception:
        pass
    return str(x)

companies_df['business_tags'] = companies_df['business_tags'].apply(parse_tags)



In [17]:
# Build a combined text for each company
def make_company_text(row):
    parts = [
        row.get('description') or "",
        row.get('business_tags_str') or "",
        row.get('sector') or "",
        row.get('category') or "",
        row.get('niche') or ""
    ]
    # Explicitly convert each part to string before stripping
    return " ".join([str(p).strip() for p in parts if p and str(p).strip()])

companies_df['text'] = companies_df.apply(make_company_text, axis=1)


In [21]:
# Preprocessing function
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower() # lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text) # remove special characters
    tokens = word_tokenize(text) # tokenize
    tokens = [word for word in tokens if word not in stop_words] # remove stopwords
    return ' '.join(tokens)

companies_df['processed_text'] = companies_df['text'].apply(preprocess_text)

In [32]:
companies_df.head()

Unnamed: 0,description,business_tags,sector,category,niche,business_tags_str,processed_text,text,id
0,Welchcivils is a civil engineering and constru...,Construction Services Multi-utilities Utility ...,Services,Civil Engineering Services,Other Heavy and Civil Engineering Construction,Construction Services Multi-utilities Utility ...,welchcivils civil engineering construction com...,Welchcivils is a civil engineering and constru...,0
1,"Kyoto Vegetable Specialists Uekamo, also known...",Wholesale Dual-task Movement Products Cast Iro...,Manufacturing,Fruit & Vegetable - Markets & Stores,"Frozen Fruit, Juice, and Vegetable Manufacturing",Wholesale Dual-task Movement Products Cast Iro...,kyoto vegetable specialists uekamo also known ...,"Kyoto Vegetable Specialists Uekamo, also known...",1
2,Loidholdhof Integrative Hofgemeinschaft is a c...,Living Forms Farm Cafe Fresh Coffee Community ...,Manufacturing,Farms & Agriculture Production,All Other Miscellaneous Crop Farming,Living Forms Farm Cafe Fresh Coffee Community ...,loidholdhof integrative hofgemeinschaft compan...,Loidholdhof Integrative Hofgemeinschaft is a c...,2
3,PATAGONIA Chapa Y Pintura is an auto body shop...,Automotive Body Repair Services Interior Repai...,Services,Auto Body Shops,"Automotive Body, Paint, and Interior Repair an...",Automotive Body Repair Services Interior Repai...,patagonia chapa pintura auto body shop located...,PATAGONIA Chapa Y Pintura is an auto body shop...,3
4,Stanica WODNA PTTK Swornegacie is a cultural e...,Cultural Activities Accommodation Services Kay...,Services,Boat Tours & Cruises,"Scenic and Sightseeing Transportation, Water",Cultural Activities Accommodation Services Kay...,stanica wodna pttk swornegacie cultural establ...,Stanica WODNA PTTK Swornegacie is a cultural e...,4


In [46]:
# Initialize a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer() # You can adjust max_features

# Fit and transform the processed text data
X = tfidf_vectorizer.fit_transform(companies_df['text'])

In [47]:
# Transform the insurance types using the *fitted* TF-IDF vectorizer
insurance_vectors = tfidf_vectorizer.transform(insurance_df['label'])

In [48]:
top_k = 3
all_recommendations = {}

# Convert sparse matrices to dense arrays if memory allows, for simpler indexing
X_dense = X.toarray()
insurance_vectors_dense = insurance_vectors.toarray()
insurance_types = insurance_df['label'].tolist()

for i, index in tqdm(enumerate(companies_df.index), total=len(companies_df)):
        # Calculate similarity scores for this company against all insurance types
    # Reshape X_dense[i] to a 2D array for cosine_similarity
    scores = cosine_similarity(X_dense[i].reshape(1, -1), insurance_vectors_dense)[0]

     # Get the indices of the top k highest scores
    top_indices = scores.argsort()[-top_k:][::-1]

    # Get the corresponding insurance types and their scores
    top_insurance_types = [insurance_types[j] for j in top_indices]
    top_scores = [scores[j] for j in top_indices]

    # Store the recommendations
    all_recommendations[index] = list(zip(top_insurance_types, top_scores))



100%|██████████| 9494/9494 [06:09<00:00, 25.69it/s]


In [51]:
first_company_id = companies_df['id'][1000]
print(f"Top 3 insurance recommendations for Company ID {first_company_id}:")
print(companies_df['text'][1000])
print(all_recommendations[first_company_id])

Top 3 insurance recommendations for Company ID 1000:
WDI is a company that specializes in designing, engineering, and manufacturing microscopy automation solutions for various industries, including Life Science, Biomedical Imaging, Machine Vision, and Electronics/Semiconductor Manufacturing. Their products range from compact autofocus sensors and automated microscopy modules to end-user IR imaging systems, including OEM components and complete optomechanical and optoelectronic sub-systems. WDI's mission is to be a global leader in the supply of advanced optical microscopy autofocal and automation solutions through technological innovation and enhancing customer productivity. The company's success is based on three principles: Precision, Focus, and Automation, with an emphasis on Precision in design, engineering. WSI's main focus is to deliver superior microscopy technology solutions, and their expertise in Automation technology and methods allows them to develop components and systems 