In [13]:
import pandas as pd
import numpy as np
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import shuffle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [14]:
company_df = pd.read_stata('Company.dta')

In [15]:
nltk.download('punkt')
# Download the NLTK stop words
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Haotian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Haotian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
stops = stopwords.words('english')

In [17]:
print(company_df['description'] )

0           Operator of a restaurant and food service show.
1         Provider of educational services. The company ...
2         Provider of an entrepreneur mentoring platform...
3         Operator of a commercial bank intended to prov...
4         Operator of a chain of restaurants intended to...
                                ...                        
442094    Manufacturer of concrete pumping systems for c...
442095    Manufacture of packaged chlorine products. The...
442096    Developer of small molecule drugs designed to ...
442097    Developer of antibacterial agents designed to ...
442098    Developer of production, operations, and quali...
Name: description, Length: 442099, dtype: object


In [18]:
company_df['description'] = company_df['description'].apply(lambda x: x.lower())
company_df['description'] = company_df['description'].apply(word_tokenize)

In [19]:
# Step 2: Collect all unique words
unique_words = set()
for doc in company_df['description']:
    unique_words.update(doc)
print(unique_words)
print(len(unique_words))

# Step 3: Calculate word frequencies
word_counts = collections.Counter()
for doc in company_df['description']:
    word_counts.update(doc)
print(word_counts)
print(len(word_counts))

188019
188019


In [20]:
word_counts_per_document = collections.defaultdict(int)
total_documents = len(company_df)

for i, row in company_df.iterrows():
    words = row['description']
    unique_words = set(words)  # Consider only unique words within each document
    for word in unique_words:
        word_counts_per_document[word] += 1

In [None]:
sorted(word_counts_per_document.items(), key=lambda x:x[1], reverse= True)

In [22]:
# Step 4: Discard common words (e.g., words used in > 5% of all documents)
frequency_threshold = 0.05 * total_documents  # Adjust the threshold as needed
non_common_words = [word for word, count in word_counts_per_document.items() if count <= frequency_threshold]

# Step 5: Create the main dictionary (N non-common words)
main_dictionary = set(non_common_words)

In [None]:
main_dictionary

In [99]:
temp = pd.read_stata('M&A pairs.dta')
temp['description_similarity_score'] = final_similarity_scores


In [100]:
temp.to_csv('temp.csv', index=False)

In [82]:
chunk_size = 5000
ma_df = pd.read_stata('M&A pairs.dta', chunksize=chunk_size)
first_chunk = next(ma_df)


In [84]:
len(first_chunk)

5000

In [85]:
#ma_df['description_a'] = ma_df['description_a'].apply(lambda x: ' '.join(x))
#ma_df['description_t'] = ma_df['description_t'].apply(lambda x: ' '.join(x))

desc_a = first_chunk['description_a'].apply(lambda x: x.lower())
desc_t = first_chunk['description_t'].apply(lambda x: x.lower())
mask_valid_desc = (~desc_a.isnull()) & (~desc_t.isnull()) & (desc_a != "") & (desc_t != "")

In [86]:
# Filter pairs with valid descriptions
 valid_desc_a= desc_a[mask_valid_desc]
valid_desc_t = desc_t[mask_valid_desc]

In [87]:
tfidf_vectorizer_desc = TfidfVectorizer(vocabulary=main_dictionary, norm='l2',stop_words=stops)
tfidf_matrix_a = tfidf_vectorizer_desc.fit_transform(valid_desc_a)
tfidf_matrix_t = tfidf_vectorizer_desc.transform(valid_desc_t)
# Compute cosine similarities for valid pairs
similarities = cosine_similarity(tfidf_matrix_a, tfidf_matrix_t)

# Extract diagonal to get pairwise similarities
similarity_scores = similarities.diagonal()
similarity_results = np.full(len(desc_a), np.nan)
similarity_results[mask_valid_desc] = similarity_scores

In [89]:
# Initialize a list to store similarity results for all chunks
all_similarity_results = []

# Process the first chunk (which we already have)
all_similarity_results.append(similarity_results)

# Process the remaining chunks
for chunk in ma_df:
    desc_a = chunk['description_a']
    desc_t = chunk['description_t']

    # Create a mask for pairs where both descriptions are present
    mask_valid_desc = (~desc_a.isnull()) & (~desc_t.isnull()) & (desc_a != "") & (desc_t != "")

    # Filter pairs with valid descriptions
    valid_desc_a = desc_a[mask_valid_desc]
    valid_desc_t = desc_t[mask_valid_desc]

    # Calculate TF-IDF vectors
    tfidf_matrix_a = tfidf_vectorizer_desc.transform(valid_desc_a)  # Note: We use transform instead of fit_transform
    tfidf_matrix_t = tfidf_vectorizer_desc.transform(valid_desc_t)

    # Compute cosine similarities for valid pairs
    similarities = cosine_similarity(tfidf_matrix_a, tfidf_matrix_t)

    # Extract diagonal to get pairwise similarities
    similarity_scores = similarities.diagonal()

    # Assign NaN to pairs with missing/empty descriptions
    similarity_results_chunk = np.full(len(desc_a), np.nan)
    similarity_results_chunk[mask_valid_desc] = similarity_scores
    
    # Append to the results list
    all_similarity_results.append(similarity_results_chunk)

# Combine results from all chunks
final_similarity_scores = np.concatenate(all_similarity_results)




array([       nan,        nan, 0.0544101 ,        nan, 0.10545674,
              nan,        nan,        nan,        nan,        nan])

In [91]:
len(final_similarity_scores)

119550

In [73]:
valid_desc_t

2         provider of an entrepreneur mentoring platform...
4         operator of a logistics company intended to pr...
10        provider of social media in the hospitality in...
12        provider of power line construction and energy...
13        developer of software-defined storage technolo...
                                ...                        
119516    developer of an online fleet management platfo...
119521    iex media nv is an independent financial websi...
119523    saigon beer alcohol beverage corp operates in ...
119536    operator of a payment system bank.. the compan...
119541    developer of advanced optical technologies des...
Name: description_t, Length: 37846, dtype: object

In [12]:
def preprocess_text(text):
    words = text.split()
    words = [word.lower() for word in words if word.lower() in main_dictionary]
    return ' '.join(words)

In [None]:
def preprocess(input_dataset, input_column):
    input_dataset[input_column] = input_dataset[input_column].apply(lambda x: x.lower())
    input_dataset[input_column] = input_dataset[input_column].apply(word_tokenize)

    # Step 2: Collect all unique words
    unique_words = set()
    for doc in input_dataset[input_column]:
        unique_words.update(doc)

    # Step 3: Calculate word frequencies
    word_counts = collections.Counter()
    for doc in input_dataset[input_column]:
        word_counts.update(doc)

    word_counts_per_document = collections.defaultdict(int)
    total_documents = len(input_dataset)

    for i, row in input_dataset.iterrows():
        words = row[input_column]
        unique_words = set(words)  # Consider only unique words within each document
        for word in unique_words:
            word_counts_per_document[word] += 1

    # Calculate the total number of documents
    total_documents = len(input_dataset)

    # Step 4: Discard common words (e.g., words used in > 5% of all documents)
    frequency_threshold = 0.05 * total_documents  # Adjust the threshold as needed
    non_common_words = [word for word, count in word_counts_per_document.items() if count <= frequency_threshold]

    # Step 5: Create the main dictionary (N non-common words)
    main_dictionary = set(non_common_words)

    # Join the tokenized words in the descriptions back into a single string
    input_dataset[input_column] = input_dataset[input_column].apply(lambda x: ' '.join(x))
    input_dataset[input_column] = input_dataset[input_column].apply(preprocess_text)

In [None]:
company_df

In [None]:
ma_df

In [None]:
ma_df.merge(company_df, how='left', left_on='companyname_a', right_on='companyname')