# SM Prototype 10K Train/Test Split, Vectorization, Linear Regression

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from utils import clean_text
import edgar

# Expand the max width of how our dataFrames display on screen
pd.options.display.max_colwidth = 500

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yural\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Vectorization methods
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Classification model
from sklearn.linear_model import LinearRegression

### Train/Test Split

Train will be a 80% split of the corpus of documents comprising of 10K and 10Qs for the companies spanning time.
Test will be the 20% remaining documents.

Documents will be the features, and Labels will be the stock price change label (positive or negative)

There will be 3 main models: one for the stock change at 1 day, one for the stock change at 1 month, one for the stock change at 3 months (before the next quarter)

In [3]:
#first pass with just SM 10-K documents only
company = edgar.Company("SM Energy", "0000893538")
tree = company.getAllFilings(filingType = "10-K")
docs = edgar.getDocuments(tree, noOfDocuments=6) #2014, 2015, 2016, 2017, 2018

In [4]:
text2018 = docs[0]
text2017 = docs[1]
text2016 = docs[2]
text2015 = docs[3]
text2014 = docs[4]

In [5]:
# Create dataframe from data
docs_df = pd.DataFrame(columns=['Filing_Type','Filing_Date','Company','Text','Label'])
docs_df
docs_df = docs_df.append({'Filing_Type': '10-K','Filing_Date': 2018,'Company':'SM', 'Text' :text2018,'Label' :'1'},ignore_index=True)
docs_df = docs_df.append({'Filing_Type': '10-K','Filing_Date': 2017,'Company':'SM', 'Text' :text2017,'Label' :'0'},ignore_index=True)
docs_df = docs_df.append({'Filing_Type': '10-K','Filing_Date': 2016,'Company':'SM', 'Text' :text2016,'Label' :'1'},ignore_index=True)
docs_df = docs_df.append({'Filing_Type': '10-K','Filing_Date': 2015,'Company':'SM', 'Text' :text2015,'Label' :'0'},ignore_index=True)
docs_df = docs_df.append({'Filing_Type': '10-K','Filing_Date': 2014,'Company':'SM', 'Text' :text2014,'Label' :'1'},ignore_index=True)

docs_df.head()

Unnamed: 0,Filing_Type,Filing_Date,Company,Text,Label
0,10-K,2018,SM,"\n10-K\n1\nsm-20181231x10k.htm\n10-K\n\n\n\n\t\n\t\t\n\t\t\n\t\tDocument\n\t\n\t\nUNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549FORM 10-Kþ Annual Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934For the fiscal year ended December 31, 2018 oro Transition Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934Commission file number 001-31539SM ENERGY COMPANY(Exact name of registrant as specified in its charter)Delaw...",1
1,10-K,2017,SM,"\n10-K\n1\nsm-20171231x10k.htm\n10-K\n\n\n\n\t\n\t\t\n\t\t\n\t\tDocument\n\t\n\t\nUNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549FORM 10-Kþ Annual Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934For the fiscal year ended December 31, 2017 oro Transition Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934Commission file number 001-31539SM ENERGY COMPANY(Exact name of registrant as specified in its charter)Delaw...",0
2,10-K,2016,SM,"\n10-K\n1\nsm-20161231x10k.htm\n10-K\n\n\n\n\t\n\t\t\n\t\t\n\t\tDocument\n\t\n\t\nUNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549FORM 10-Kþ Annual Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934For the fiscal year ended December 31, 2016 oro Transition Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934Commission file number 001-31539SM ENERGY COMPANY(Exact name of registrant as specified in its charter)Delaw...",1
3,10-K,2015,SM,"\n10-K\n1\nsm-20151231x10k.htm\n10-K\n\n\n\n\t\n\t\t\n\t\t\n\t\t10-K\n\t\n\t\nUNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549FORM 10-Kþ Annual Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934For the fiscal year ended December 31, 2015 oro Transition Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934Commission file number 001-31539SM ENERGY COMPANY(Exact name of registrant as specified in its charter)Delaware(...",0
4,10-K,2014,SM,"\n10-K\n1\nsm-20141231x10k.htm\n10-K\n\n\n\n\t\n\t\t\n\t\t\n\t\tSM-2014.12.31-10K\n\t\n\t\nUNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549FORM 10-Kþ Annual Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934For the fiscal year ended December 31, 2014 oro Transition Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934Commission file number 001-31539SM ENERGY COMPANY(Exact name of registrant as specified in its char...",1


In [6]:
### Assign X (data) and y (target)
X = docs_df.drop("Label", axis=1)
y = docs_df["Label"]
print(X.shape, y.shape)

(5, 4) (5,)


In [7]:
#train/test split at 80%
from sklearn.model_selection import train_test_split
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [8]:
#look at train/test split
XTrain
yTrain
XTest
yTest

1    0
Name: Label, dtype: object

In [9]:
#Create train/test dataframes
n_docs = 100000 #number of times to run model
train_docs = XTrain
train_labels = yTrain
test_docs = XTest
test_labels = yTest

train_df = pd.DataFrame({"text": train_docs['Text'], "labels": train_labels})
train_df

test_df = pd.DataFrame({"text": test_docs['Text'], "labels": test_labels})
test_df

print(f"Train Shape: {train_df.shape}")
print(f"Test Shape: {test_df.shape}")

Train Shape: (4, 2)
Test Shape: (1, 2)


## CountVectorization

Turn the raw text from TrainText and TestText into feature vectors so that we can use them in our model.
We vectorize the text in 2 steps: 
1. First, we `fit`, the training data to our vectorizer to compute the vocabulary (feature set). 
2. Then, we `transform` with our text for both train and test to count the number occurrences for each word in our vocabulary.

The output of the CountVectorizer's `transform` task is a [sparse matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix), which condenses the matrix values to avoid storing an excessive amount of zeros.

In [10]:
vectorizer = CountVectorizer(stop_words='english') ##could limit the words being used in here
vectorizer.fit(train_df['text']) ##fit training data's text to get the feature set
train_vecs = vectorizer.transform(train_df['text']) ##transform with training to count word frequencies
test_vecs = vectorizer.transform(test_df['text']) ##transform with test data to count word frequencies

In [11]:
#full vocabulary size for training data
print(f"Number of documents: {train_vecs.shape[0]}")
print(f"Size of vocabulary: {train_vecs.shape[1]}")

#full vocabulary size for testing data
print(f"Number of documents: {test_vecs.shape[0]}")
print(f"Size of vocabulary: {test_vecs.shape[1]}")

Number of documents: 4
Size of vocabulary: 8548
Number of documents: 1
Size of vocabulary: 8548


In [12]:
#get rid of non-zero values because it's a sparse matrix 
# Train
print(f"Number of TRAINING non-zero features: {train_vecs.nnz}")
print(f"Number of TRAINING zero features: {(train_vecs.shape[0]*train_vecs.shape[1])-train_vecs.nnz}")

# Test
print(f"Number of TEST non-zero features: {test_vecs.nnz}")
print(f"Number of TEST zero features: {(test_vecs.shape[0]*test_vecs.shape[1])-test_vecs.nnz}")

Number of TRAINING non-zero features: 21852
Number of TRAINING zero features: 12340
Number of TEST non-zero features: 4715
Number of TEST zero features: 3833


### Display a few terms and their tf-idf scores for a few documents. 

This is only meant to be used for demonstration purposes. The cell below has no impact on the actual execution of our task. Also, this cell is only intended for use when the number of documents is small (<100), otherwise it will likely only display a bunch of zeros.



In [13]:
df_counts = pd.DataFrame(train_vecs.toarray(), 
                         columns=vectorizer.get_feature_names())[:15].T

df_counts.sort_values(by = [0], ascending = False) 
#df_counts.to_csv("sm10k1.csv")

Unnamed: 0,0,1,2,3
gas,544,540,500,547
company,526,590,590,499
2014,492,207,24,375
oil,466,470,473,448
december,360,488,443,402
31,352,482,449,403
2013,327,31,18,191
million,317,399,341,339
production,316,306,361,311
net,313,385,377,324


## Term Frequency-Inverse Document Frequency (TF-IDF)

Tf-idf is a statistical representation of how relevant a word is to a particular document within a corpus. _Relevance_, in this scenario, can be defined as how much information a word provides about the context of one document vs all other documents in the corpus. 

In short, tf-idf is calculated by comparing the number of times that a particular terms occurs in a given document vs the number of other documents in the corpus that contain that word. A word that frequently occurs in 1 document, but only occurs in a very small number of other documents will have a high tf-idf score.

The calculation for tf-idf is the product of two smaller calculations:

$$TF_{i,j} = \frac{Number~of~times~word_{i}~occurs~in~document_{j}}{Total~number~of~words~in~document_{j}}$$


$$IDF_{i} = log(\frac{Total~number~of~documents~in~corpus}{Number~of~documents~that~contain~word_{i}})$$

##### Example: 

Let's say we have 10,000 documents about the solar system. If we were to take one single document with 200 terms and see that _Europa_ (one of Jupiter's moons) was mentioned 5 times, then _Europa's_ term frequency (tf) for that document would be: 

$$TF_{Europa, document} = \frac{5}{200}=0.025$$


Now if we were to see that _Europa_ only occurs in 50 of the total 10,000 documents, then the inverse document frequency (idf) would be: 

$$IDF_{Europa} = log(\frac{10,000}{50})=2.3$$

Therefore our tf-idf score for _Europa_ for that given document would be:

$$ 0.025 * 2.3 = 0.575 $$

This might actually be useful for us because we can see if specific terms are especially important to certain documents relative to others -- this only matters because maybe we can see when acquisitions might be on the horizon, for example.

In [14]:
#tf-idf vectorization using calculation
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer.fit(train_df['text'])
train_tfidf_vecs = tfidf_vectorizer.transform(train_df['text'])
test_tfidf_vecs = tfidf_vectorizer.transform(test_df['text'])

### Display a few terms and their tf-idf scores for a few documents

This is only meant to be used for demonstration purposes. The cell below has no impact on the actual execution of our task. Also, this cell is only intended for use when the number of documents is small (<100), otherwise it will likely only display a bunch of zeros.

In [15]:
#alice note -- obviously need to clean this up a bit.
df_tfidf = pd.DataFrame(train_tfidf_vecs.toarray(), 
                         columns=tfidf_vectorizer.get_feature_names())[:15].T
df_tfidf.tail(20)

Unnamed: 0,0,1,2,3
years,0.072288,0.073252,0.067591,0.077058
years1,0.0,0.0,0.0,0.000935
years140,0.000949,0.0,0.0,0.0
years9,0.0,0.000836,0.0,0.0
yearslong,0.000495,0.000436,0.00046,0.000488
yes,0.001485,0.001308,0.002299,0.001463
yesþ,0.001211,0.001067,0.0,0.001193
yield,0.002971,0.002616,0.003219,0.002926
yield0,0.000495,0.000436,0.00046,0.000488
yielding,0.000495,0.000436,0.00046,0.000488


#### Comparison of the representation of different words vs. straight up word frequency

In [16]:
pd.DataFrame({"TF-IDF of stock":df_tfidf.loc['stock'], "CountVectorizer: Stock":df_counts.loc['stock']})

Unnamed: 0,TF-IDF of stock,CountVectorizer: Stock
0,0.084666,171
1,0.082844,190
2,0.068971,150
3,0.072669,149


In [17]:
pd.DataFrame({"TF-IDF of drilling":df_tfidf.loc['drilling'], "CountVectorizer: Drilling":df_counts.loc['drilling']})

Unnamed: 0,TF-IDF of drilling,CountVectorizer: Drilling
0,0.094073,190
1,0.07674,176
2,0.06989,152
3,0.081935,168


# Modeling -- classifying whether certain terms appearing will result in a positive or negative stock price

Vectorizing our data has converted our text data into a numeric feature set. Using these vectors, we can now begin to develop machine learning models for things like classification.

To further this model, look into better preprocessing, regression regularization, vocabulary pruning for feature selection, and hyperparameter tuning.

In [18]:
#determine regression model
from sklearn.linear_model import LinearRegression
classifier = LinearRegression()
classifier

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
classifier.fit(train_vecs, train_df['labels'])
predictions = classifier.predict(test_vecs)
print(f"Prediction   {predictions[:10]}")
print(f"Actual labels: {test_df['labels']}")

Prediction   [1.05017068]
Actual labels: 1    0
Name: labels, dtype: object


In [24]:
#Run a linear regression classification on the count vectors
count_linearReg = LinearRegression()
count_linearReg.fit(train_vecs, train_df['labels']) 
#train_vecs = frequency/ terms (unique features for this document)
#['labels'] is labeling if the change in stock price was positive or negative for the time frame
count_preds = count_linearReg.predict(test_vecs) 
#model.coef_ -> gives you coefficient values for all of terms. then look to see which have high weights / low weights. 
#terms are being weighted off of the change in the stock price. 
#if you see term in high weight (occurrence results in stock going up), (if low / negative weight, stock going down)

# Calculate the percentage of accurate predictions
accuracy = np.mean(count_preds==test_df['labels'])
print(f"LinearReg CountVectorizer accuracy: {accuracy}") ##linear regression, fit with the frequency*weights, 

LinearReg CountVectorizer accuracy: 0.0


### Run a logistic regression classification on the TF-IDF vectors

In [25]:
tfidf_linReg = LinearRegression()
tfidf_linReg.fit(train_tfidf_vecs, train_df['labels'])
tfidf_preds = tfidf_linReg.predict(test_tfidf_vecs)

# Calculate the percentage of accurate predictions
accuracy = np.mean(tfidf_preds==test_df['labels'])
print(f"LinReg TF-IDF accuracy: {accuracy}")

LinReg TF-IDF accuracy: 0.0


### View the terms with the highest coefficient values for each category

Notice that the terms highly weighted for each category seem to have highly negative weights for other categories. If we were to use more similarly related categories, we may not see such drastic differences.

Ignore the code behind this table. It is poorly written, but demonstrates the correct results.

In [28]:
from utils import getTopCoefs

getTopCoefs(num_terms=5, model=tfidf_linReg, class_labels=train_df['labels'], feature_names=tfidf_vectorizer.get_feature_names())

TypeError: 'numpy.float64' object is not iterable

### View coefficient weights for CountVectorizer features

In [32]:
getTopCoefs(num_terms=5, model=count_linearReg, class_labels=train_df['labels'], feature_names=vectorizer.get_feature_names())

TypeError: 'numpy.float64' object is not iterable

In [35]:
#Predicted inaccurately:
# Expand the max width of how our dataFrames display on screen
pd.options.display.max_colwidth = 1000

# Compile a dataframe with our text, the actual label, and the predicted label
final_df = pd.DataFrame({"text": test_df['text'], "Actual": test_df['labels'], "Prediction": tfidf_preds})

# Display the rows of our dataframe where the actual label and predicted label don't match
final_df.loc[(final_df['Actual'] != final_df['Prediction'])]

Unnamed: 0,text,Actual,Prediction
1,"\n10-K\n1\nsm-20171231x10k.htm\n10-K\n\n\n\n\t\n\t\t\n\t\t\n\t\tDocument\n\t\n\t\nUNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549FORM 10-Kþ Annual Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934For the fiscal year ended December 31, 2017 oro Transition Report Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934Commission file number 001-31539SM ENERGY COMPANY(Exact name of registrant as specified in its charter)Delaware(State or other jurisdiction of incorporation or organization)41-0518430(I.R.S. Employer Identification No.)1775 Sherman Street, Suite 1200, Denver, Colorado(Address of principal executive offices)80203(Zip Code)(303) 861-8140(Registrant’s telephone number, including area code)Securities registered pursuant to Section 12(b) of the Act:Title of each class Name of each exchange on which registeredCommon stock, $.01 par value New York Stock ExchangeSecurities registered pursuant to Section 12(g)...",0,1.060073
