In [1]:
### Safe to rerun ###

import config
import gspread
import json
import openai
import os

import numpy as np
import pandas as pd

# Authorize gspread
gc, authorized_user = gspread.oauth_from_dict(config.gspread_secret_key_personal, config.gspread_auth_key_personal)

# Read Input from directory

directory = r"C:\Users\Hooman Deghani\Python\Data Analysis\Outreach - Skyscraper\Output\Current"

## List all CSV files in the directory

urls = [os.path.join(directory, filename) for filename in os.listdir(directory) if filename.endswith('.xlsx')]

if len(urls) > 1:
    df = pd.concat([pd.read_excel(url) for url in urls], ignore_index=True)

else: 
    df = pd.read_excel(urls[0])

# Create a variable for the title of the project
for filename in os.listdir(directory):
    if filename.endswith('.xlsx'):
        # Capture the cryptic name
        artifact_name = filename
        
        # Extract the essence
        title = artifact_name.split('.')[0]

# Load records.json into a python dictionary titled records
with open(r"C:\Users\Hooman Deghani\Python\Data Analysis\Outreach - Skyscraper\Records.json", "r") as records_json:
    records = json.load(records_json)

# Check if an entry in records exists with title
if title not in records:
    records[title] = {
        'New': 'True',
        'Last': ''
    }

records

{'fthb': {'New': 'False',
  'Last': 'https://morganton.com/lifestyles/home-and-garden/real-estate/co-buying-a-house-how-platonic-partners-make-it-work/article_a4c1234a-6a88-5819-9ee1-bfb26cf02a6d.html'},
 'wfh': {'New': 'False',
  'Last': 'https://www.thenexthint.com/simple-steps-to-make-working-from-home-work-for-you/11739/'}}

In [2]:
### Safe to rerun ###

# Clean up the data #
# Make df json compliant
df = df.applymap(str)
df.replace("nan", "", inplace=True)
df = df.applymap(str.strip)

# Remove rows where email is empty
Recipient_null = df.loc[:, 'Recipient'] == ""
df = df.loc[~Recipient_null]

# Reset index
df = df.reset_index(drop=True)

In [3]:
### Safe to rerun ###
# Prepare the batch #

# TODO: drop Anchor, DR
df = df.drop(columns=['Anchor', 'Domain rating'])

# Create the dic variable
dic = records.get(title)

# if dataframe is new, batch = df.loc[0:30,:]
if dic.get("New") == "True":
    batch = df.loc[0:30,:]

# if dataframe is old, batch = the next 30 after "Last"
if dic.get("New") == "False":

    # Get the index label of the last email sent
    filt = df.loc[:, 'Referring page URL'] == records.get(title).get("Last")
    last = df.loc[filt].index[0]

    # batch = 30 after last
    batch = df.loc[(last + 1): (last + 31)]

batch

Unnamed: 0,Referring page URL,Referring Topic,Target URL,Root URL,First Name,Last Name,Recipient,Email Sent,Status,Replied,Converted
155,http://www.olimp.turystyka.pl/vza/getting-thin...,,https://blog.hubspot.com/marketing/productivit...,https://olimp.turystyka.pl,Sir/Madam,,olimp@olimp.turystyka.pl,,,,
156,https://geekinsider.com/boost-productivity-whe...,,https://blog.hubspot.com/marketing/productivit...,https://geekinsider.com,Matthew,Harris,matthew@geekinsider.com,,,,
157,https://upcea.edu/home-sweet-home-is-not-offic...,,https://blog.hubspot.com/marketing/productivit...,https://upcea.edu,Jordan,Dimaggio,jdimaggio@upcea.edu,,,,
158,https://blog.naturalwellbeing.com/6-tips-for-n...,,https://blog.hubspot.com/marketing/productivit...,https://naturalwellbeing.com,Sir/Madam,,info@naturalwellbeing.com,,,,
159,https://www.nya.org/about/healthy-mind-motivat...,,https://blog.hubspot.com/marketing/productivit...,https://nya.org,Kevin,Roy,kroy@nya.org,,,,
160,https://blog.fracturedatlas.org/best-resources...,,https://blog.hubspot.com/marketing/productivit...,https://fracturedatlas.org,Sophia,Park,sophia.park@fracturedatlas.org,,,,
161,https://vocal.media/lifehack/tips-on-working-r...,,https://blog.hubspot.com/marketing/productivit...,https://vocal.media,Sir/Madam,,news_politics@vocal.media,,,,
162,https://www.belvoir.co.uk/articles/creating-a-...,,https://blog.hubspot.com/marketing/productivit...,https://belvoir.co.uk,James,Smale,james.smale@belvoir.co.uk,,,,
163,http://m.prodia.co.id/en/Content/ViewContentsD...,,https://blog.hubspot.com/marketing/productivit...,https://prodia.co.id,Sir/Madam,,tegal@prodia.co.id,,,,
164,https://wearethecity.com/working-top-tips-prod...,,https://blog.hubspot.com/marketing/productivit...,https://wearethecity.com,Vanessa,Vallely,vanessa.vallely@wearethecity.com,,,,


In [5]:
### Safe to Rerun ###
#### Needs manual edit ####

# Fill Target Topic & Target Hyperlink

# Create the "Target Topic" column
batch.loc[:, ['Target Topic', 'Target Hyperlink']] = ''


# Get the list of target URLs
print(batch.loc[:, 'Target URL'].value_counts())


# Create filters for each URL
url_1 = "https://blog.hubspot.com/marketing/productivity-tips-working-from-home"
filt_1 = (batch.loc[:, 'Target URL'] == url_1)

url_2 = "https://blog.hubspot.com/marketing/productivity-tips-working-from-home?fbclid=IwAR3_CGB5laJiHk1TxnMiA03h-hXzDwOQD1tLdQdXBzFmaQOS2TnSNwil318"
filt_2 = (batch.loc[:, 'Target URL'] == url_2)

url_3 = "https://blog.hubspot.com/marketing/productivity-tips-working-from-home?__hstc=219294534.b2b47713ef61d04c6d6fede4eb0fd448.1582064984829.1583868321116.1583937752741.59&__hssc=219294534.8.1583937752741&__hsfp=1704468437"
filt_3 = (batch.loc[:, 'Target URL'] == url_3)

# Fill out Target Hyperlink
batch.loc[filt_1, 'Target Hyperlink'] = f"""<a href="{url_1}">the working-from-home guide by Hubspot</a>"""

if filt_2.any():
    batch.loc[filt_2, 'Target Hyperlink'] = f"""<a href="{url_2}">the first-time homebuyer guide by Nerdwallet</a>"""

if filt_3.any():
    batch.loc[filt_3, 'Target Hyperlink'] = f"""<a href="{url_3}">the first-time homebuyer guide by Nerdwallet</a>"""

# Fill out Target Topic: prereferences; e.g. the guide by nerdwallet
batch.loc[filt_1, 'Target Topic'] = "the guide by Hubspot"

if filt_2.any():
    batch.loc[filt_2, 'Target Topic'] = "the guide by Hubspot"

if filt_3.any():
    batch.loc[filt_3, 'Target Topic'] = "the guide by Hubspot"

https://blog.hubspot.com/marketing/productivity-tips-working-from-home                                                                                                                                                      29
https://blog.hubspot.com/marketing/productivity-tips-working-from-home?fbclid=IwAR3_CGB5laJiHk1TxnMiA03h-hXzDwOQD1tLdQdXBzFmaQOS2TnSNwil318                                                                                  1
https://blog.hubspot.com/marketing/productivity-tips-working-from-home?__hstc=219294534.b2b47713ef61d04c6d6fede4eb0fd448.1582064984829.1583868321116.1583937752741.59&__hssc=219294534.8.1583937752741&__hsfp=1704468437     1
Name: Target URL, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch.loc[:, ['Target Topic', 'Target Hyperlink']] = ''


In [6]:
### Safe to rerun ###
# Playground

batch.head()

Unnamed: 0,Referring page URL,Referring Topic,Target URL,Root URL,First Name,Last Name,Recipient,Email Sent,Status,Replied,Converted,Target Topic,Target Hyperlink
155,http://www.olimp.turystyka.pl/vza/getting-thin...,,https://blog.hubspot.com/marketing/productivit...,https://olimp.turystyka.pl,Sir/Madam,,olimp@olimp.turystyka.pl,,,,,the guide by Hubspot,"<a href=""https://blog.hubspot.com/marketing/pr..."
156,https://geekinsider.com/boost-productivity-whe...,,https://blog.hubspot.com/marketing/productivit...,https://geekinsider.com,Matthew,Harris,matthew@geekinsider.com,,,,,the guide by Hubspot,"<a href=""https://blog.hubspot.com/marketing/pr..."
157,https://upcea.edu/home-sweet-home-is-not-offic...,,https://blog.hubspot.com/marketing/productivit...,https://upcea.edu,Jordan,Dimaggio,jdimaggio@upcea.edu,,,,,the guide by Hubspot,"<a href=""https://blog.hubspot.com/marketing/pr..."
158,https://blog.naturalwellbeing.com/6-tips-for-n...,,https://blog.hubspot.com/marketing/productivit...,https://naturalwellbeing.com,Sir/Madam,,info@naturalwellbeing.com,,,,,the guide by Hubspot,"<a href=""https://blog.hubspot.com/marketing/pr..."
159,https://www.nya.org/about/healthy-mind-motivat...,,https://blog.hubspot.com/marketing/productivit...,https://nya.org,Kevin,Roy,kroy@nya.org,,,,,the guide by Hubspot,"<a href=""https://blog.hubspot.com/marketing/pr..."


In [7]:

# # Fill referring Hyperlink

batch.loc[:, 'Referring Hyperlink'] = ""

# Fill referring topic by gpt-3.5
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch.loc[:, 'Referring Hyperlink'] = ""


In [8]:
## Safe to Rerun ##
## Needs manual edit ##

# Fill "SQ1 URL column"
## first possible url: https://www.squareone.ca/resource-centres/home-buying-selling-moving/buying-a-home-for-the-first-time
## second possible url: https://www.squareone.ca/resource-centres/home-personal-safety/work-from-home-guide
batch.loc[:, 'SQ1 URL'] = 'https://www.squareone.ca/resource-centres/home-personal-safety/work-from-home-guide'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch.loc[:, 'SQ1 URL'] = 'https://www.squareone.ca/resource-centres/home-personal-safety/work-from-home-guide'


In [19]:
# # Push to Gspreadsheet
# spreadsheet_title = "Outreach"
# worksheet_title = f"Last:{batch.loc[batch.tail(1).index[0], 'First Name']}{batch.loc[batch.tail(1).index[0], 'Last Name']}"

# spreadsheet = gc.open(spreadsheet_title)
# worksheet = spreadsheet.add_worksheet(title=worksheet_title, rows=batch.shape[0], cols=batch.shape[1])
# worksheet.update([batch.columns.tolist()] + batch.values.tolist())

In [9]:
# Save output to input
file_name = f"{batch.loc[batch.tail(1).index[0], 'Recipient']}.csv"
path = r"C:\Users\Hooman Deghani\Python\Data Analysis\Outreach - Skyscraper\Input\Current\F"

batch.to_csv(path+file_name)

In [10]:
# Update records.json #

# Update records with the new variables
records[title]["New"] = "False"
records[title]["Last"] = batch.loc[batch.tail(1).index[0], 'Referring page URL']

# Push records to records.json
with open(r"C:\Users\Hooman Deghani\Python\Data Analysis\Outreach - Skyscraper\Records.json", "w") as records_json:
    json.dump(records, records_json)