# LogisticRegression

In [71]:
# Import dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
import numpy as np
import pandas as pd 
from tpot import TPOTClassifier



In [72]:
#Load data
df0=pd.read_csv("../data/group_0.csv")
df1 = pd.read_csv("../data/group_1.csv")
df2=pd.read_csv("../data/group_2.csv")
cobn=[df0,df1,df2]
df=pd.concat(cobn)
# df=df. sample(n=50, axis=0)
df.label.value_counts()

2    42116
1    18703
0     5801
Name: label, dtype: int64

In [86]:
# split data into test & train
X = df["abstract"]
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)#stratify=y,train_size=0.75,

# transform X and y to lists for processing
X_train = X_train.tolist()
X_test = X_test.tolist()
y_train = y_train.tolist()
y_test = y_test.tolist()
len(X_train),len(X_test)

(49965, 16655)

In [87]:
#word to vector
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',ngram_range=(1, 3))
hash_vectorizer = HashingVectorizer(analyzer='word',stop_words='english', ngram_range=(1, 3),n_features=2 ** 18)
X_train=hash_vectorizer.fit_transform(X_train)
X_test=hash_vectorizer.fit_transform(X_test)


In [110]:
# Fit to logistic regression function
classifier = LogisticRegression(solver="sag", multi_class='multinomial')
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

In [112]:
# training and testing data score
print(f"Training accuracy: {classifier.score(X_train, y_train)}")
print(f"Testing accuracy: {classifier.score(X_test, y_test)}")

# Training accuracy: 0.8489742819973982
# Testing accuracy: 0.7805463824677275

Training accuracy: 0.9018112678875213
Testing accuracy: 0.7905133593515461


In [113]:
# Making predictions
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head()

Unnamed: 0,Actual,Prediction
0,2,2
1,0,1
2,2,2
3,2,2
4,1,1


In [114]:
from sklearn.metrics import classification_report
target_names = ["Highly Prestigious Journals", "Medium Impact Journal","Low Impact Journal"]
report = classification_report(y_test, predictions, target_names=target_names)
print(report)

                             precision    recall  f1-score   support

Highly Prestigious Journals       0.77      0.22      0.34      1400
      Medium Impact Journal       0.65      0.66      0.66      4597
         Low Impact Journal       0.85      0.92      0.88     10658

                avg / total       0.79      0.79      0.77     16655



In [116]:
# Saving the model
import pickle
# pickle.dump(classifier,open('../models_trained/Paper_qlty_logReg_model_multinomial_sag', 'wb'))
classifier.classes_
#export the code


array([0, 1, 2])

In [117]:
# Test: loading in the pickled model
clf2 = pickle.load(open('../models_trained/Paper_qlty_logReg_model_multinomial_sag', 'rb'))

In [118]:
df0.abstract[10]

'Cancer progression involves the gradual loss of a differentiated phenotype and acquisition of progenitor and stem-cell-like features. Here, we provide novel stemness indices for assessing the degree of oncogenic dedifferentiation. We used an innovative one-class logistic regression (OCLR) machine-learning algorithm to extract transcriptomic and epigenetic feature sets derived from non-transformed pluripotent stem cells and their differentiated progeny. Using OCLR, we were able to identify previously undiscovered biological mechanisms associated with the dedifferentiated oncogenic state. Analyses of the tumor microenvironment revealed unanticipated correlation of cancer stemness with immune checkpoint expression and infiltrating immune cells. We found that the dedifferentiated oncogenic phenotype was generally most prominent in metastatic tumors. Application of our stemness indices to single-cell data revealed patterns of intra-tumor molecular heterogeneity. Finally, the indices allowe

In [119]:
verify_abst=df0.abstract[432]#"this is a test"#

Journal_class = {0:"has high probablity of published in HIGH Impact Journals", 
                 1:"has high probablity of published in MEDIUM Impact Journal",
                 2:"has NOT  a chance of published in HIGH or MEDIUM Impact Journal"}
def model_predict(s):
    string = []
    string.append(s)
    test = hash_vectorizer.fit_transform(string)
    result = clf2.predict(test)
    return result[0]


print(f"This abstract has high probablity to be published in {Journal_class[model_predict(verify_abst)]}\
 (Level-{model_predict(verify_abst)+1}) ")

This abstract has high probablity to be published in has high probablity of published in MEDIUM Impact Journal (Level-2) 


# Using AutoML to Generate Machine Learning Pipelines with TPOT

In [54]:
# Import dependencies
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
import numpy as np
import pandas as pd 

In [55]:
#Load data
df0=pd.read_csv("../data/group_0.csv")
df1 = pd.read_csv("../data/group_1.csv")
df2=pd.read_csv("../data/group_2.csv")
cobn=[df0,df1,df2]
df=pd.concat(cobn)
# df=df. sample(n=5000, axis=0)
df.label.value_counts()

2    42116
1    18703
0     5801
Name: label, dtype: int64

In [60]:
# split data into test & train
X = df["abstract"]
df.rename(columns={'label': 'class'}, inplace=True)# # rename the target/response variable as class
y = df["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y,train_size=0.75, test_size=0.25)

# transform X and y to lists for processing
X_train = X_train.tolist()
X_test = X_test.tolist()
y_train = y_train.tolist()
y_test = y_test.tolist()
len(X_train),len(X_test)

(49965, 16655)

In [61]:
#word to vector
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',ngram_range=(1, 3))
hash_vectorizer = HashingVectorizer(analyzer='word',stop_words='english', ngram_range=(1, 3),n_features=69000)
X_train=hash_vectorizer.fit_transform(X_train)
X_test=hash_vectorizer.fit_transform(X_test)


In [62]:
df.dtypes

Unnamed: 0     int64
pmid           int64
title         object
abstract      object
journal       object
class          int64
dtype: object

In [63]:
# feature data type convertion to array type
training_feature=X_train.toarray()
test_feature=X_test.toarray()
type(test_feature)

numpy.ndarray

In [None]:
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(training_feature, y_train)
# print(tpot.score(validation_indices, testing_indices))

Optimization Progress:  51%|█████     | 61/120 [2:24:38<3:16:04, 199.40s/pipeline]

Generation 1 - Current best internal CV score: 0.7288120369611495


Optimization Progress:  76%|███████▌  | 91/120 [4:01:55<1:40:29, 207.93s/pipeline]

Generation 2 - Current best internal CV score: 0.7288120369611495


                                                              06, 243.34s/pipeline]

Generation 5 - Current best internal CV score: 0.7288120369611495

Best pipeline: LinearSVC(input_matrix, C=10.0, dual=False, loss=squared_hinge, penalty=l2, tol=0.0001)


TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=5, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=20, periodic_checkpoint_folder=None,
        population_size=20, random_state=None, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)

In [61]:
#TPOT accuracy
X_test = X_test.toarray()
y_test=np.array(y_test)
print(tpot.score(X_test, y_test)) #=0.7448
# type(X_test)


0.7448


In [32]:
#export the best algorithm code
tpot.export('../output/tpot-journal-pipeline2.py')

True

## LenearSVC model as suggested by TPOT best pipeline

In [144]:
# Import dependencies
from sklearn.svm import LinearSVC ## LinearSVC is selectd based on the  optimization TPOT best pipeline result
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
import numpy as np
import pandas as pd 
from tpot import TPOTClassifier


In [145]:
# Load dataset
df_0=pd.read_csv("../data/group_0.csv")
df_1 = pd.read_csv("../data/group_1.csv")
df_2=pd.read_csv("../data/group_2.csv")
cobn=[df_0,df_1,df_2]
tPOTdf=pd.concat(cobn)
# df=df. sample(n=1000, axis=0)
tPOTdf.label.value_counts()

2    42116
1    18703
0     5801
Name: label, dtype: int64

In [146]:
# split data into test & train
X = tPOTdf["abstract"]
y = tPOTdf["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y,train_size=0.8, test_size=0.2)

# transform X and y to lists for processing
X_train2 = X_train.tolist()
X_test2 = X_test.tolist()
y_train2 = y_train.tolist()
y_test2 = y_test.tolist()
len(X_train2),len(X_test2)

(53296, 13324)

In [147]:
stop_words=['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than', 'the','study','ref','here','role','to','-PRON-','this','that','background','introduction','method','conclusion','find']


#word to vector
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=stop_words,ngram_range=(1, 3))
hash_vectorizer = HashingVectorizer(analyzer='word',stop_words=stop_words, ngram_range=(1, 3))
feature_train=hash_vectorizer.fit_transform(X_train2)
feature_test=hash_vectorizer.fit_transform(X_test2)

In [226]:
clf3 = LinearSVC(class_weight="balanced", loss='squared_hinge',multi_class='ovr', penalty='l2')#, tol=0.0001
clf3.fit(feature_train, y_train2)

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [227]:
# training and testing data score
print(f"Training accuracy: {clf3.score(feature_train, y_train2)}")
print(f"Testing accuracy: {clf3.score(feature_test, y_test2)}")

# Training accuracy: 0.9989867907535275
# Testing accuracy: 0.8146202341639147

Training accuracy: 0.9989867907535275
Testing accuracy: 0.8146202341639147


In [228]:
# Making predictions
predictions = clf3.predict(feature_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test2}).head()

Unnamed: 0,Actual,Prediction
0,2,2
1,2,2
2,1,1
3,2,2
4,1,1


In [229]:
# save model
from sklearn.externals import joblib
joblib.dump(clf3, '../models_trained/linearSVC_model_aws.pkl')

['../models_trained/linearSVC_model_aws.pkl']

In [151]:
# Saving the model
import pickle
# pickle.dump(clf3,open('../models_trained/linearSVC_model_Final', 'wb'))
classifier.classes_
#export the code

array([0, 1, 2])

In [152]:
# Test: loading in the pickled model
clf3 = pickle.load(open('../models_trained/linearSVC_model_Final', 'rb'))

In [225]:
aws s3 cp clf3.pkl s3://www.abstract-significance-prediction.com

SyntaxError: invalid syntax (<ipython-input-225-765f3443ec27>, line 1)

In [153]:
verify_abst="this is a test"#df0.abstract[432]

Journal_class = {0:"has high probablity of published in HIGH Impact Journals", 
                 1:"has high probablity of published in MEDIUM Impact Journal",
                 2:"has NOT  a chance of published in HIGH or MEDIUM Impact Journal"}
def model_predict(s):
    string = []
    string.append(s)
    test = hash_vectorizer.fit_transform(string)
    result = clf3.predict(test)
    return result[0]


print(f"This abstract has high probablity to be published in {Journal_class[model_predict(verify_abst)]}\
 (or it is LEVEL-{model_predict(verify_abst)+1} abstract) ")

This abstract has high probablity to be published in has NOT  a chance of published in HIGH or MEDIUM Impact Journal (or it is LEVEL-3 abstract) 


In [None]:
aws s3 cp model.pkl s3://yourbucketname