## Modeling - Logistic Regression

In [29]:
# Importing packages

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [30]:
# Loading the scraped subreddit titles from 'data/df.csv' into a DataFrame
# and previewing the first rows.

csv_path = 'data/df.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,subreddit,title
0,sanfrancisco,"Federal complaint alleges bribery, corruption ..."
1,sanfrancisco,Small Venue
2,sanfrancisco,I found some common ground!
3,sanfrancisco,Bells of Dracula at the Edwardian Ball
4,sanfrancisco,Safe to run 3-4 miles...


In [31]:
# Features are the post titles; the target is the subreddit each title came from.
# The split is stratified on the target so the train and test subsets keep the same
# subreddit proportions, and random_state is fixed so the split is reproducible.

X, y = df['title'], df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=13,
)

In [32]:
# Building a standalone CountVectorizer with sklearn's built-in english stop word list
# and fitting it on the training titles only.
# Note: this object is separate from the CountVectorizer created inside the pipeline
# (which is instantiated without a stop word list); it is used further down to list
# the vocabulary words.

ctvc = CountVectorizer(stop_words='english').fit(X_train)

In [33]:
# Chaining a CountVectorizer and a LogisticRegression into one estimator, so that
# fitting the pipeline vectorizes the titles and then fits the classifier on the counts.
# The step names 'ctvc' and 'lr' are the prefixes used when tuning hyperparameters.
# NOTE(review): recent scikit-learn releases deprecate the 'multi_class' argument - confirm
# against the installed version before upgrading.

vectorizer_step = ('ctvc', CountVectorizer())
classifier_step = ('lr', LogisticRegression(solver='lbfgs', multi_class='auto'))
pipe = Pipeline([vectorizer_step, classifier_step])

In [34]:
# Hyperparameter grid for the search. Each key is '<step name>__<parameter>', and every
# combination of the listed values is evaluated to find the best-scoring configuration.

pipe_params = dict(
    ctvc__max_features=[None, 5000],
    ctvc__max_df=[.8, .9, .95],
    ctvc__ngram_range=[(1, 1), (1, 2)],
)

In [35]:
# Running a 5-fold cross-validated grid search over 'pipe' using the candidate
# hyperparameters in 'pipe_params', fitted on the training data.
# fit() returns the fitted search object itself, so 'gs' is the fitted grid search.

gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5).fit(X_train, y_train)

In [36]:
# Displaying the hyperparameter combination selected by the grid search
gs.best_params_

{'ctvc__max_df': 0.8, 'ctvc__max_features': None, 'ctvc__ngram_range': (1, 1)}

In [37]:
# Scoring the tuned model 'gs' on the training and the held-out test data.
# The pair is displayed as (train score, test score); a large gap between the two
# would indicate that the model is overfitting the training titles.

train_score = gs.score(X_train, y_train)
test_score = gs.score(X_test, y_test)
train_score, test_score

(0.9866666666666667, 0.625)

In [38]:
# Predicting the subreddit (city) for every training title and collecting the actual label,
# the predicted label and the title text side by side in 'train_predictions_df'.

predictions_train = gs.predict(X_train)
train_predictions_df = pd.DataFrame(
    dict(actual=y_train, predicted=predictions_train, title=X_train)
)
train_predictions_df.head()

Unnamed: 0,actual,predicted,title
2368,Austin,Austin,All the cursed places you can name
640,sanfrancisco,sanfrancisco,City officials warn residents to beware blessi...
481,sanfrancisco,sanfrancisco,I made a Sutro Tower 3D laser cut wooden model...
144,sanfrancisco,sanfrancisco,"Like a fine wine or a smokin’ China&gt;Rider, ..."
3908,Atlanta,Atlanta,Atlanta Meetup


In [39]:
# Predicting the subreddit (city) for every test title and collecting the actual label,
# the predicted label and the title text side by side in 'test_predictions_df'.

predictions_test = gs.predict(X_test)
test_predictions_df = pd.DataFrame(
    dict(actual=y_test, predicted=predictions_test, title=X_test)
)
test_predictions_df.head()

Unnamed: 0,actual,predicted,title
683,sanfrancisco,sanfrancisco,"Just moved here from Chicago, I could get used..."
531,sanfrancisco,Austin,London Breed about to meet 1000 shelter bed goal
520,sanfrancisco,sanfrancisco,A FedEx truck hit a fire hydrant on Tehama str...
3250,Atlanta,Atlanta,Another fire under the 85 connector Bridge!
3282,Atlanta,Atlanta,What is the MUST SEE/EAT in Atlanta?!


In [40]:
# Creating a DataFrame called 'words_df' with one row per vocabulary word learned by the
# standalone 'ctvc' (training titles only, english stop words removed), together with the
# subreddit that the tuned model 'gs' predicts when that word is submitted on its own.

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(). Prefer the new accessor and fall back to the old
# one so the cell runs on both old and current scikit-learn versions.
if hasattr(ctvc, 'get_feature_names_out'):
    # get_feature_names_out() returns an array; convert to a plain list of str
    feature_names = ctvc.get_feature_names_out().tolist()
else:
    feature_names = ctvc.get_feature_names()

# Each vocabulary word is treated as a one-word "title" and classified by the pipeline
word_cities = gs.predict(feature_names)
words_df = pd.DataFrame({
    'word' : feature_names,
    'city' : word_cities
})


In [41]:
# Stacking the train and test prediction tables into a single DataFrame of all titles
prediction_frames = [train_predictions_df, test_predictions_df]
the_df = pd.concat(prediction_frames)

In [42]:
# Writing the results to disk: title-level predictions (train and test combined)
# and the word-level predictions, each to its own CSV file.

for frame, csv_target in (
    (the_df, 'data/titles_df.csv'),
    (words_df, 'data/words_df.csv'),
):
    frame.to_csv(csv_target)

## Phrase / Word Tester

In [1]:
# To use the phrase / word tester, replace the word test between the quotations
# below with your phrase or word of choice, then run this cell.
# The fitted model 'gs' must already exist: run the modeling cells above first,
# in the same kernel session, otherwise this cell raises a NameError.
phrase = 'test'
gs.predict([phrase])[0]


NameError: name 'gs' is not defined