In [None]:
#Note: The code reads in two csv files 'Train.csv' and 'Test.csv' and concatenates them into a single dataframe 'df'. 
#It then removes the 'id' column and any duplicate rows based on the 'article' and 'highlight' columns. 
#It checks for any missing values and prints out information about the dataframe. 
#Finally, it uses the OpenAI API to generate a summary of the given 'article_text' using the GPT-3 model.

In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import sklearn.metrics as ms
import re
import openai
import nltk

#Downloading necessary NLTK packages
nltk.download('stopwords')
nltk.download('punkt')

#Importing necessary NLTK modules
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
from nltk.corpus import stopwords
ps = PorterStemmer()
from nltk import word_tokenize
import string 


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\9941064513.UPS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\9941064513.UPS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#Reading Data
train_df=pd.read_csv("Train.csv")
test_df=pd.read_csv("Test.csv")
df=pd.concat([train_df,test_df])

In [3]:
#Removing 'id' column from the dataframe

df = df.drop(['id'], axis=1)
df = df.reset_index(drop=True)

train_df = train_df.drop(['id'], axis=1)
train_df = train_df.reset_index(drop=True)
test_df = test_df.drop(['id'], axis=1)
test_df = test_df.reset_index(drop=True)

In [4]:
#Removing duplicates from the dataframe

train_df.duplicated(subset= ['article', 'highlights']).sum()
train_df = train_df.drop_duplicates(subset= ['article', 'highlights'])

test_df.duplicated(subset= ['article', 'highlights']).sum()
test_df = test_df.drop_duplicates(subset= ['article', 'highlights'])

df.duplicated(subset= ['article', 'highlights']).sum()
df = df.drop_duplicates(subset= ['article', 'highlights'])


In [5]:
df.dropna(axis=0,inplace=True)

# Checking for any missing values

print(df.isna().sum())
print(df.info())
print(train_df.info())

# Set up the OpenAI API client
openai.api_key = "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"

In [11]:
# Choose the GPT-3 model to use
model_engine = "davinci"

In [12]:
# Load the original news article text

article_text = "By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained bishops in Italy last month. Symptoms of hepatitis A include fever, tiredness, loss of appetite, nausea and abdominal discomfort. Fargo Catholic Diocese in North Dakota (pictured) is where the bishop is located ."


In [13]:
# Set the desired length of the summary
summary_length = 100

In [14]:
# Generate the summary using the GPT-3 API
corpus = []

for i in range(0, 128):
    article = re.sub('[^a-zA-Z]', ' ', df['article'][i])
    article = re.sub(r"[^\w\s]", " ", df['article'][i])
    article = article.lower()
    article = article.split()
    ps = PorterStemmer()
    article = [ps.stem(word) for word in article if not word in set(stopwords.words('english'))]
    article = ' '.join(article)
    corpus.append(article)
    completion = openai.Completion.create(engine=model_engine,
                                      prompt=article_text,
                                      max_tokens=summary_length,
                                      n=1,
                                      stop=None,
                                      temperature=0.5)

In [15]:
# Extract the summary text from the API response
summary_text = completion.choices[0].text

In [16]:
# Print the summary text
print(summary_text)

 The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immun
