In [1]:
import pandas as pd
import numpy as np

import os
import requests

In [2]:
with open("S&P_500tickers.pkl", "rb") as f:
    cmp = pd.read_pickle(f)
    
df = pd.DataFrame(index=cmp, columns=["Description", "TF-IDF_LSA"])

overall_descriptions = []
cmp_folder_name = "company_desc/"
for curr_cmp in df.index:
    f_name = cmp_folder_name + curr_cmp + ".txt"
    
    with open(f_name, "r") as f:
        overall_descriptions.append(f.read())
df["Description"] = overall_descriptions

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from sklearn.decomposition import TruncatedSVD

from sklearn.cluster import KMeans, MiniBatchKMeans

In [4]:
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, 
                             stop_words='english', use_idf=True)
new_dim = 500
svd = TruncatedSVD(new_dim)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(vectorizer, svd, normalizer)

X = lsa.fit_transform(overall_descriptions)


explained_variance = svd.explained_variance_ratio_.sum()
exp_print = "Explained variance of the SVD step: {}%"
print(exp_print.format(int(explained_variance * 100)))
print("Dimensionality {0} -> {1} reduced by {2}".format(
    len(overall_descriptions[0]), new_dim, abs(new_dim - len(overall_descriptions[0])) / len(overall_descriptions[0]))
     )

Explained variance of the SVD step: 100%
Dimensionality 10388 -> 500 reduced by 0.9518675394686177


In [5]:
df["TF-IDF_LSA"] = X.tolist()

In [6]:
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics

import matplotlib
import matplotlib.pyplot as plt

plt.style.use("seaborn")

In [7]:
assignment = []

for k in [10, 25, 50]:
    classifier = KMeans(n_clusters=k, max_iter=100, verbose=0)
    classifier.fit(X)
    assignment.append(classifier.labels_)

In [8]:
df["Industry_V0"] = assignment[0]
df["Industry_V1"] = assignment[1]
df["Industry_V2"] = assignment[2]

In [13]:
# Assignments done based on tf-idf vectorizer, then LSA, then normalizer

"""
Evaluation metrics:

1) use NMF to see the descriptions and see if they make sense

2) Use the correlation matrix and see if they are highly (+, -) related
"""

Index(['ALL', 'AIG', 'AIZ', 'CAT', 'DHI', 'HOG', 'RF'], dtype='object')
Index(['AYI', 'ALK', 'ALLE', 'AAL', 'APH', 'BA', 'BWA', 'DLPH', 'DAL', 'FLIR',
       'GRMN', 'GD', 'HRS', 'HON', 'IR', 'JCI', 'LLL', 'LMT', 'NOC', 'RTN',
       'COL', 'LUV', 'TEL', 'TXT', 'TDG', 'UAL', 'UTX'],
      dtype='object')
Index(['AAP', 'APD', 'AME', 'AMAT', 'ARNC', 'BLL', 'CMI', 'DE', 'ETN', 'FAST',
       'FTV', 'GPC', 'GT', 'GWW', 'ITW', 'ISRG', 'JBHT', 'LEG', 'LKQ', 'NUE',
       'PCAR', 'PH', 'PPG', 'PX', 'ROK', 'SHW', 'SNA', 'SWK', 'URI', 'WHR'],
      dtype='object')


In [57]:
from sklearn.decomposition import NMF

holder = []
inspection = df[df["Industry_V0"] == 0]
print(inspection.keys)
#group_inspection = inspection.corr()

<bound method NDFrame.keys of                                             Description  \
ALB   Albemarle Corporation, incorporated on Novembe...   
MO    Altria Group, Inc., incorporated on August 27,...   
AAL   American Airlines Group Inc., incorporated on ...   
ADM   Archer-Daniels-Midland Company, incorporated o...   
AVY   Avery Dennison Corporation (Avery Dennison), i...   
BLL   Ball Corporation (Ball), incorporated on Decem...   
BBY   Best Buy Co., Inc., incorporated on October 20...   
BF.B  Brown-Forman Corporation, incorporated on Octo...   
CPB   Campbell Soup Company, incorporated on Novembe...   
CCL   Carnival Corporation, incorporated on November...   
CF    CF Industries Holdings, Inc., incorporated on ...   
CMG   Chipotle Mexican Grill, Inc. (Chipotle), incor...   
CHD   Church & Dwight Co., Inc., incorporated on Dec...   
CLX   The Clorox Company, incorporated on September ...   
KO    The Coca-Cola Company, incorporated on Septemb...   
CL    Colgate-Palmolive Co