In [1]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /opt/conda/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /opt/conda/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /opt/conda/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /opt/conda/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /opt/conda/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /opt/conda/n

[nltk_data]    | Downloading package opinion_lexicon to
[nltk_data]    |     /opt/conda/nltk_data...
[nltk_data]    |   Unzipping corpora/opinion_lexicon.zip.
[nltk_data]    | Downloading package panlex_swadesh to
[nltk_data]    |     /opt/conda/nltk_data...
[nltk_data]    | Downloading package paradigms to
[nltk_data]    |     /opt/conda/nltk_data...
[nltk_data]    |   Unzipping corpora/paradigms.zip.
[nltk_data]    | Downloading package pe08 to /opt/conda/nltk_data...
[nltk_data]    |   Unzipping corpora/pe08.zip.
[nltk_data]    | Downloading package perluniprops to
[nltk_data]    |     /opt/conda/nltk_data...
[nltk_data]    |   Unzipping misc/perluniprops.zip.
[nltk_data]    | Downloading package pil to /opt/conda/nltk_data...
[nltk_data]    |   Unzipping corpora/pil.zip.
[nltk_data]    | Downloading package pl196x to /opt/conda/nltk_data...
[nltk_data]    |   Unzipping corpora/pl196x.zip.
[nltk_data]    | Downloading package porter_test to
[nltk_data]    |     /opt/conda/nltk_data.

True

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
import string
import pandas as pd

# Initial Company Dataset
Our dataset imported with LLM summaries

In [3]:
df = pd.read_csv('company_summaries.csv')
display(df)

Unnamed: 0.1,Unnamed: 0,name,domain,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate,summary
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906,IBM is a technology and consulting company tha...
1,2309813,us army,goarmy.com,1800.0,military,10001+,"alexandria, virginia, united states",united states,linkedin.com/company/us-army,162163,445958,The U.S. Army provides diverse career opportun...
2,2959148,cognizant technology solutions,cognizant.com,1994.0,information technology and services,10001+,"teaneck, new jersey, united states",united states,linkedin.com/company/cognizant,122031,210020,Cognizant helps businesses modernize technolog...
3,5944912,walmart,walmartcareers.com,1962.0,retail,10001+,"withee, wisconsin, united states",united states,linkedin.com/company/walmart,120753,272827,Walmart is a global retailer and employer offe...
4,3300741,at&t,att.com,1876.0,telecommunications,10001+,"dallas, texas, united states",united states,linkedin.com/company/at&t,115188,269659,AT&T is a telecommunications company offering ...
...,...,...,...,...,...,...,...,...,...,...,...,...
19800,3278172,bush industries,bush.co,1957.0,furniture,201 - 500,"jamestown, new york, united states",united states,linkedin.com/company/bush-industries,106,411,"Okay, I'm ready. Please provide the website UR..."
19801,2386412,texas a&m foundation,txamfoundation.com,1953.0,philanthropy,201 - 500,"college station, texas, united states",united states,linkedin.com/company/texas-a&m-foundation,106,198,"The Texas A&M Foundation, established in 1953,..."
19802,4378167,lexicon relocation,lexiconrelocation.com,1993.0,real estate,201 - 500,"jacksonville, florida, united states",united states,linkedin.com/company/lexicon-relocation,106,217,Sterling Lexicon is a global relocation manage...
19803,4613124,honor,joinhonor.com,2014.0,hospital & health care,201 - 500,"san francisco, california, united states",united states,linkedin.com/company/joinhonor,106,152,Honor Technology is a healthtech company focus...


# Creating a custom stemmer
Creating a stemming function to remove all punctuation and remove all stop words from LLM summary. It is important to stem the words consistently to highlight similarities across different comapny descriptions

In [6]:
stemmer = PorterStemmer()

def stem_tokenizer(text):
    no_punc = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(no_punc.lower())
    return [stemmer.stem(word) for word in tokens if word not in ENGLISH_STOP_WORDS]

# TF-IDF vectorization
Tf-IDF vectorization of LLM summaries and appending them to baseline dataset

In [7]:
vect = TfidfVectorizer(min_df = 10, ngram_range = (1,3), tokenizer = stem_tokenizer)
tfidf_mat = vect.fit_transform(df.summary)
mat_as_df = pd.DataFrame.sparse.from_spmatrix(tfidf_mat)
final_df = pd.concat([df, mat_as_df], axis = 1)

In [8]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,name,domain,year founded,industry,size range,locality,country,linkedin url,current employee estimate,...,20335,20336,20337,20338,20339,20340,20341,20342,20343,20344
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2309813,us army,goarmy.com,1800.0,military,10001+,"alexandria, virginia, united states",united states,linkedin.com/company/us-army,162163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2959148,cognizant technology solutions,cognizant.com,1994.0,information technology and services,10001+,"teaneck, new jersey, united states",united states,linkedin.com/company/cognizant,122031,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5944912,walmart,walmartcareers.com,1962.0,retail,10001+,"withee, wisconsin, united states",united states,linkedin.com/company/walmart,120753,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3300741,at&t,att.com,1876.0,telecommunications,10001+,"dallas, texas, united states",united states,linkedin.com/company/at&t,115188,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
final_df = final_df.drop('Unnamed: 0', axis = 1)
final_df = final_df.dropna()
final_df.head()


Unnamed: 0,name,domain,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate,...,20335,20336,20337,20338,20339,20340,20341,20342,20343,20344
0,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,us army,goarmy.com,1800.0,military,10001+,"alexandria, virginia, united states",united states,linkedin.com/company/us-army,162163,445958,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,cognizant technology solutions,cognizant.com,1994.0,information technology and services,10001+,"teaneck, new jersey, united states",united states,linkedin.com/company/cognizant,122031,210020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,walmart,walmartcareers.com,1962.0,retail,10001+,"withee, wisconsin, united states",united states,linkedin.com/company/walmart,120753,272827,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,at&t,att.com,1876.0,telecommunications,10001+,"dallas, texas, united states",united states,linkedin.com/company/at&t,115188,269659,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
final_df = final_df.reset_index()
final_df.head()

Unnamed: 0,index,name,domain,year founded,industry,size range,locality,country,linkedin url,current employee estimate,...,20335,20336,20337,20338,20339,20340,20341,20342,20343,20344
0,0,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,us army,goarmy.com,1800.0,military,10001+,"alexandria, virginia, united states",united states,linkedin.com/company/us-army,162163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,cognizant technology solutions,cognizant.com,1994.0,information technology and services,10001+,"teaneck, new jersey, united states",united states,linkedin.com/company/cognizant,122031,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,walmart,walmartcareers.com,1962.0,retail,10001+,"withee, wisconsin, united states",united states,linkedin.com/company/walmart,120753,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,at&t,att.com,1876.0,telecommunications,10001+,"dallas, texas, united states",united states,linkedin.com/company/at&t,115188,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Dimensionality Reduction
The dataset is now too large and needs to be reduced. The dataset is reduced to 10 principal components and appended with their names as labels

In [11]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

name = final_df['name']

tobe_pca_df = final_df.drop(labels=['name', 'domain', 'industry', 'size range', 'locality', 'country', 'linkedin url', 'summary'], axis = 1)

scaler = StandardScaler()
scaled_df = scaler.fit_transform(tobe_pca_df)

pca = PCA(n_components = 10)
pca.fit(scaled_df)
pca_data = pca.transform(scaled_df)

pca_df = pd.DataFrame(data=pca_data, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10'])
pca_df['name'] = name

# Exporting PCA dataset to csv

In [31]:
import numpy as np
from tqdm import tqdm

# Split the DataFrame into chunks
pca_chunks = np.array_split(pca_df, 200)

# Save each chunk to CSV with progress tracking
with open('pca_data.csv', 'w') as f:
    for i, chunk in enumerate(tqdm(pca_chunks)):
        if i == 0:
            chunk.to_csv(f, index=False)
        else:
            chunk.to_csv(f, header=False, index=False, mode='a')


100%|██████████| 200/200 [00:00<00:00, 677.59it/s]


# Exporting raw data
Attempt to extract the raw data to be compared against with the clustering. THe dataset was too large, and this process could not be completed.

In [None]:
raw_chunks = np.array_split(final_df, 10000)
with open('raw_data.csv', 'w') as f:
    for i, raw_chunk in enumerate(tqdm(raw_chunks)):
        if i == 0:
            raw_chunk.to_csv(f, index=False)
        else:
            raw_chunk.to_csv(f, header=False, index=False, mode='a')