### Machine Learning for Econonomics Journal Abstracts

In [2]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [113]:
topJourns_df = pd.read_csv("raw_data_econ/topRanks_cleaned.csv", encoding = "'iso-8859-1'")
hiJourns_df = pd.read_csv("raw_data_econ/hiRanks_cleaned.csv", encoding = "'iso-8859-1'")
notHiJourns_df = pd.read_csv("raw_data_econ/notHiRanks_cleaned.csv", encoding = "'iso-8859-1'")
notHiJourns2_df = pd.read_csv("raw_data_econ/notHiRanks2_cleaned.csv", encoding = "'iso-8859-1'")

frames = [topJourns_df, hiJourns_df, notHiJourns_df, notHiJourns2_df]
combined_df = pd.concat(frames)
combined_df["abstract"].shape

(8229,)

In [142]:
# split data into test & train
X = combined_df["abstract"]
y = combined_df["top_journal"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# transform X and y to lists for processing
X_train = X_train.tolist()
X_test = X_test.tolist()
y_train = y_train.tolist()
y_test = y_test.tolist()

In [143]:
# Fit to logistic regression function
classifier = LogisticRegression()

#word to vector
tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
hash_vectorizer = HashingVectorizer(analyzer='word', ngram_range=(1, 3),n_features=50000)
X_train=hash_vectorizer.fit_transform(X_train)
X_test=hash_vectorizer.fit_transform(X_test)

classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [86]:
combined_df.head()

Unnamed: 0.1,Unnamed: 0,abstract,top_journal
0,0,We propose local measure relationship paramete...,1
1,1,The labor market increasingly rewards social s...,1
2,2,We develop theory endogenous uncertainty busin...,1
3,3,What shapes optimal degree progressivity tax t...,1
4,4,An increase household debt GDP ratio predicts ...,1


In [144]:
# training and testing data score
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9009884945713823
Testing Data Score: 0.6982507288629738


In [88]:
# Making predictions
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)

Unnamed: 0,Actual,Prediction
0,1,1
1,1,1
2,0,0
3,0,1
4,0,0
5,0,1
6,1,1
7,0,1
8,1,1
9,0,0


In [145]:
from sklearn.metrics import classification_report
target_names = ["Not top Journal", "Top Journal"]
report = classification_report(y_test, predictions, target_names=target_names)
print(report)

                 precision    recall  f1-score   support

Not top Journal       0.71      0.68      0.70      1034
    Top Journal       0.69      0.71      0.70      1024

    avg / total       0.70      0.70      0.70      2058



# Naive Bayes

In [146]:
combined_df = combined_df[['abstract','top_journal']]
combined_df.head()

Unnamed: 0,abstract,top_journal
0,We propose local measure relationship paramete...,1
1,The labor market increasingly rewards social s...,1
2,We develop theory endogenous uncertainty busin...,1
3,What shapes optimal degree progressivity tax t...,1
4,An increase household debt GDP ratio predicts ...,1


In [147]:
data = combined_df #text in column 1, classifier in column 2.
import numpy as np
numpy_array = data.as_matrix()
X = combined_df["abstract"]
#X=X.astype('float')
Y = combined_df["top_journal"]
#Y=Y.astype('float')
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
 X, Y, test_size=0.4, random_state=42)

In [148]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB

In [149]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
 ('tfidf', TfidfTransformer()),
 ('clf', MultinomialNB()),
])

In [150]:
text_clf = text_clf.fit(X_train,Y_train)

In [151]:
predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)


0.672539489671932

# SVM

In [152]:
# Training Support Vector Machines - SVM and calculating its performance

from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-4, n_iter=2, random_state=42))])

text_clf_svm = text_clf_svm.fit(X_train, Y_train)
predicted_svm = text_clf_svm.predict(X_test)
np.mean(predicted_svm == Y_test)



0.6713244228432563

# Grid Search

In [27]:
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning. 
# E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [28]:
# Next, we create an instance of the grid search by passing the classifier, parameters 
# and n_jobs=-1 which tells to use multiple cores from user machine.

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, Y_train)

In [29]:
# To see the best mean score and the params, run the following code

gs_clf.best_score_
gs_clf.best_params_

# Output for above should be: The accuracy has now increased to ~90.6% for the NB classifier (not so naive anymore! 😄)
# and the corresponding parameters are {‘clf__alpha’: 0.01, ‘tfidf__use_idf’: True, ‘vect__ngram_range’: (1, 2)}.

{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [30]:
# Similarly doing grid search for SVM
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train, Y_train)


gs_clf_svm.best_score_
gs_clf_svm.best_params_



{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

In [None]:
#Stemming Code

import nltk
nltk.download()

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(X_train, Y_train)

predicted_mnb_stemmed = text_mnb_stemmed.predict(X_test)

np.mean(predicted_mnb_stemmed == Y_test)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


# TREE

In [97]:
from sklearn import tree

In [124]:
X_scaler = StandardScaler().fit(X_train.values.reshape(-1, 1))

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
 encoded_X, Y, test_size=0.4, random_state=42)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[1090. 5347. 4402. ... 4317. 7371. 2804.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

# Random Forest 

In [122]:

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, Y_test)

ValueError: could not convert string to float: 'Conventional economic analyses successful explaining differences living arrangements particularly dramatic increase fraction young adults living parents Mediterranean Europe . This paper presents cultural interpretation . I argue sexual revolution 1970s-by liberalizing parental attitudes-had differential impact living arrangements Northern Southern Europe account closer parent-child ties Southern Europe . Such interpretation easily explain shift living arrangements time also observed North-South differentials . It receives support data living arrangements second-generation immigrants United States , 1970 2000 . This duplication European pattern neutral environment , unemployment benefits , welfare code , macroeconomic conditions suggests major role culture determining living arrangements . ( JEL : D1 , J1 , Z13 ) ( c ) 2007 European Economic Association .'