In [1]:
import pandas as pd
import numpy as np

import os
import requests

In [2]:
# Load the pickled ticker collection that becomes the DataFrame index.
# NOTE: unpickling can execute arbitrary code; only load pickles you trust.
cmp = pd.read_pickle("S&P_500tickers.pkl")

# One row per ticker. "TF-IDF_LSA" starts out empty and is filled in once
# the LSA vectors have been computed.
df = pd.DataFrame(index=cmp, columns=["Description", "TF-IDF_LSA"])

# Read company_desc/<TICKER>.txt for every ticker, preserving index order so
# the list lines up row-for-row with `df`.
cmp_folder_name = "company_desc/"
overall_descriptions = []
for curr_cmp in df.index:
    f_name = "".join([cmp_folder_name, curr_cmp, ".txt"])
    with open(f_name, "r") as f:
        description_text = f.read()
    overall_descriptions.append(description_text)

df["Description"] = overall_descriptions

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from sklearn.decomposition import TruncatedSVD

from sklearn.cluster import KMeans, MiniBatchKMeans

In [4]:
# Latent semantic analysis (LSA) over the company descriptions:
# TF-IDF weighting -> truncated SVD -> row normalisation.
# Terms found in more than half of the documents (max_df=0.5) or in fewer
# than two documents (min_df=2) are dropped, along with English stop words.
LSA_SEED = 42
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2,
                             stop_words='english', use_idf=True)
# TruncatedSVD's default solver is randomized; fixing the seed makes the
# embedding (and every clustering built on it) repeatable across runs.
svd = TruncatedSVD(500, random_state=LSA_SEED)
# copy=False allows the normalizer to rescale the SVD output in place.
normalizer = Normalizer(copy=False)
lsa = make_pipeline(vectorizer, svd, normalizer)

# One row per description, in the same order as `overall_descriptions`.
X = lsa.fit_transform(overall_descriptions)

# Share of the TF-IDF variance retained by the SVD components.
# NOTE(review): with 500 components the recorded run keeps ~99% of the
# variance, i.e. very little compression -- confirm 500 is intentional.
explained_variance = svd.explained_variance_ratio_.sum()
exp_print = "Explained variance of the SVD step: {}%"
# int() truncates, so the percentage is rounded down.
print(exp_print.format(int(explained_variance * 100)))

Explained variance of the SVD step: 99%


In [11]:
# Store each company's LSA vector as a plain Python list in its row.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours -- re-run the notebook top to bottom so that recorded outputs
# reflect the list-valued column written here.
df["TF-IDF_LSA"] = [row.tolist() for row in X]

print(df.loc[:, "TF-IDF_LSA"])

MMM      [0.3123167260825056, -0.13200870725526853, -0....
ABT      [0.18759489067030977, -0.05942228979796618, -0...
ABBV     [0.07446043008835211, -0.026481151525808183, -...
ACN      [0.32569195827268266, 0.02318840236383187, -0....
ATVI     [0.10932024518122525, -0.011418382379160106, -...
AYI      [0.15861368106571397, -0.06806452267446089, -0...
ADBE     [0.14696068381326877, -0.014989544286570441, -...
AMD      [0.10180190571788512, -0.02400159053932905, -0...
AAP      [0.18769798265622903, -0.08550942000281991, -0...
AES      [0.1607086305248354, -0.11487560962063018, 0.1...
AET      [0.2838450160064189, 0.16146903963917805, -0.1...
AMG      [0.21517639429794214, 0.1973565146114312, 0.09...
AFL      [0.21128481093810197, 0.1498009005659911, -0.0...
A        [0.1692153548792112, -0.055674607938386356, -0...
APD      [0.24222994711354215, -0.13037075545372045, 0....
AKAM     [0.25102828947246125, -0.0368300767541033, -0....
ALK      [0.09471732472883951, -0.03258742293606123, -0.

In [6]:
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics

import matplotlib
import matplotlib.pyplot as plt

# The bundled "seaborn" style was renamed "seaborn-v0_8" in matplotlib 3.6 and
# the old name was later removed; use whichever name this install provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")

In [7]:
# Cluster the LSA vectors at three granularities. assignment[i] holds the
# per-row cluster labels for the i-th value of k below.
KMEANS_SEED = 42
assignment = []

for k in [10, 25, 50]:
    # random_state pins the centroid initialisation so the labels are
    # repeatable between runs. n_init=10 spells out the long-standing default
    # explicitly, since newer scikit-learn releases default to "auto".
    classifier = KMeans(n_clusters=k, max_iter=100, n_init=10,
                        random_state=KMEANS_SEED, verbose=0)
    classifier.fit(X)
    assignment.append(classifier.labels_)

In [8]:
# Attach the label arrays from the three clustering runs as columns,
# in the order the runs were appended to `assignment`.
for position, column in enumerate(("Industry_0", "Industry_1", "Industry_2")):
    df[column] = assignment[position]

In [9]:
# Show every row whose "Industry_1" label is 0. The label number itself is
# just an identifier assigned by the clustering run.
in_cluster_zero = df["Industry_1"] == 0
print(df.loc[in_cluster_zero])

                                            Description  TF-IDF_LSA  \
ANDV  Andeavor, formerly Tesoro Corporation, incorpo...    0.225748   
COG   Cabot Oil & Gas Corporation, incorporated on D...    0.202067   
CHK   Chesapeake Energy Corporation (Chesapeake), in...    0.208597   
CVX   Chevron Corporation (Chevron), incorporated on...    0.210575   
COP   ConocoPhillips, incorporated on November 16, 2...    0.255927   
DVN   Devon Energy Corporation, incorporated on May ...    0.271693   
EQT   EQT Corporation (EQT), incorporated on June 10...    0.235475   
FCX   Freeport-McMoRan Inc. (FCX), incorporated on N...    0.085222   
HES   Hess Corporation, incorporated on February 7, ...    0.228446   
KMI   Kinder Morgan, Inc., incorporated on August 23...    0.234784   
MRO   Marathon Oil Corporation, incorporated on May ...    0.246736   
MPC   Marathon Petroleum Corporation, incorporated o...    0.144940   
OXY   Occidental Petroleum Corporation (Occidental),...    0.256384   
OKE   