In [1]:
import numpy as np
import pandas as pd 
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
def _create_frequency_table(text_string) -> dict:

    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable


def _score_sentences(sentences, freqTable) -> dict:
    sentenceValue = dict()

    for sentence in sentences:
        word_count_in_sentence = (len(word_tokenize(sentence)))
        word_count_in_sentence_except_stop_words = 0
        for wordValue in freqTable:
            if wordValue in sentence.lower():
                word_count_in_sentence_except_stop_words += 1
                if sentence[:10] in sentenceValue:
                    sentenceValue[sentence[:10]] += freqTable[wordValue]
                else:
                    sentenceValue[sentence[:10]] = freqTable[wordValue]

        if sentence[:10] in sentenceValue:
            sentenceValue[sentence[:10]] = sentenceValue[sentence[:10]] / word_count_in_sentence_except_stop_words
    return sentenceValue


def _find_average_score(sentenceValue) -> int:

    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original text
    average = (sumValues / len(sentenceValue))

    return average


def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:10] in sentenceValue and sentenceValue[sentence[:10]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary


def run_summarization(text):
    # 1 Create the word frequency table
    freq_table = _create_frequency_table(text)

    sentences = sent_tokenize(text)
    sentence_scores = _score_sentences(sentences, freq_table)
    threshold = _find_average_score(sentence_scores)
    summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)

    return summary


In [3]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
test['label']='t'

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer

#data prep
test=test.fillna(' ')
train=train.fillna(' ')
test['total']=test['title']+' '+test['author']+test['text']
train['total']=train['title']+' '+train['author']+train['text']

#tfidf
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train['total'].values)
tfidf = transformer.fit_transform(counts)

In [5]:
targets = train['label'].values
test_counts = count_vectorizer.transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, targets, random_state=0)

  idf = np.log(n_samples / df) + 1


In [6]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
print('Accuracy of Lasso classifier on training set: {:.2f}'
     .format(logreg.score(X_train, y_train)))
print('Accuracy of Lasso classifier on test set: {:.2f}'
     .format(logreg.score(X_test, y_test)))

Accuracy of Lasso classifier on training set: 1.00
Accuracy of Lasso classifier on test set: 0.98


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [7]:
y_train_pred = logreg.predict_proba(X_train)
print(y_train_pred[:,1], y_train)

[1.00000000e+00 9.99989048e-01 2.46202164e-05 ... 2.53000335e-07
 1.02082068e-08 2.45538298e-05] [1 1 0 ... 0 0 0]


In [8]:
y_test_pred =logreg.predict_proba(X_test)
print(y_test_pred[:,1], y_test)

[0.00108589 0.99996355 0.99999597 ... 0.22990449 0.9999998  0.55225337] [0 1 1 ... 0 1 0]


In [9]:
targets = train['label'].values
logreg = LogisticRegression()
logreg.fit(counts, targets)

example_counts = count_vectorizer.transform(test['total'].values)
predictions = logreg.predict(example_counts)
pred=pd.DataFrame(predictions,columns=['label'])
pred['id']=test['id']
pred.groupby('label').count()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,2603
1,2597


In [10]:
prediction_data = pd.read_csv('/home/gray/Desktop/NeuralNews/Scraper-code/Second.csv')
data_writing = pd.read_csv('/home/gray/Desktop/NeuralNews/Scraper-code/Second.csv')
for i in range(len(prediction_data['Paragraph'])):
    prediction_data['Paragraph'][i] = prediction_data['Paragraph'][i].strip(',;\[]\',\'')
    data_writing['Paragraph'][i] = data_writing['Paragraph'][i].strip(',;\[]\',\'')

In [11]:
data_writing['label'] = 't'
prediction_data = prediction_data.fillna(' ')
data_writing = data_writing.fillna(' ') 
prediction_data['total'] = prediction_data['Title'] + prediction_data['Paragraph']

In [12]:
stopWords = set(stopwords.words("english"))

for i in range(len(prediction_data['total'])):
    li = ""
    words = word_tokenize(prediction_data['total'][i])
    for w in words:
        #print(w,' ')
        if w not in stopWords:
            li = li +' '+ w
    prediction_data['total'][i] = li

In [13]:
pred_counts = count_vectorizer.transform(prediction_data['total'].values)
pred_tfidf = transformer.fit_transform(pred_counts)

  idf = np.log(n_samples / df) + 1


In [22]:
predictions = logreg.predict_proba(pred_counts)


In [15]:
prediction_data['label'] = predictions[:, [1]]*100

In [17]:
data_writing['label'] = predictions[:, [1]]*100

In [18]:
data_writing['Summary'] = 's'
for i in range(len(data_writing['Title'])):
    result = run_summarization(data_writing['Paragraph'][i])
    data_writing['Summary'][i] = result
    print(i, result)

0  ",
1  ', 'Oregon State Police troopers assisted local police at the protest. State police frequently helped police at the protests until early August, when they withdrew. ',
2  ", 'Naidu, who was in the fourth car, escaped unhurt. ',
3  ",
4  Both have taken their toll, said Bauer in an interview. ",
5  ', 'Both the mother and the fetus were under constant observation. ',
6  You\'ll get it in pouches," a peddler told India Today\'s investigative reporter posing as staff of a wealthy businessman planning a drug party. probed the reporter. the reporter investigated."Yes. You\'ll get in the shape of chalks. You\'ll get 50 grams in one go," the peddler said.When India Today inquired about ganja, a peddler was found selling it some yards away. probed the reporter. the reporter asked a peddler there. You\'ll get it in pouches," the peddler offered. asked the reporter. ',
7  ', '“No party. ', '“(It was) the best I can do,” said Leclerc over the team radio. ', '“Valtteri was very very close

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [19]:
data_writing['Summary']

0                                                    ",
1      ', 'Oregon State Police troopers assisted loc...
2      ", 'Naidu, who was in the fourth car, escaped...
3                                                    ",
4      Both have taken their toll, said Bauer in an ...
5      ', 'Both the mother and the fetus were under ...
6      You\'ll get it in pouches," a peddler told In...
7      ', '“No party. ', '“(It was) the best I can d...
8      ", '"I’m sure he’s going to be climbing up th...
9                                                    ",
10     ', '"The institute shall remain closed for on...
11     Action will be taken action against them, he ...
Name: Summary, dtype: object

In [20]:
data_writing.to_csv('second_dataset.csv')

In [21]:
data_writing.to_json('second_dataset.json')