## Scraping Data to Represent Fake User Profiles


In [None]:
import requests
import pandas as pd
import time
import random
import re
import numpy as np
import _pickle as pickle
from tqdm import tqdm_notebook as tqdm
from bs4 import BeautifulSoup as bs

## BeautifulSoup to Navigate

In [None]:
# Randomizing the refresh rate: candidate pauses between 0.8 and 1.7 seconds
seq = [i/10 for i in range(8,18)]

# Creating a list of bios. Every entry is a one-element list so that
# pd.DataFrame(biolist, columns=['Bios']) always produces exactly one column.
biolist = []

# Number of refreshes that failed at the network/HTTP level and were skipped
failed_requests = 0

# Gathering bios by looping and refreshing the web page
for _ in tqdm(range(1000)):

    try:
        # Refreshing the page; the timeout keeps a stalled connection from hanging the run
        page = requests.get("https://www.fakepersongenerator.com/user-biography-generator",
                            timeout=10)
        page.raise_for_status()
    except requests.RequestException:
        # Best-effort scrape: count the failure and move on to the next refresh
        failed_requests += 1
    else:
        # Naming the parser explicitly so the parse does not depend on which
        # optional parser libraries happen to be installed
        soup = bs(page.content, "html.parser")

        # Getting the bios; find() returns None when the expected container is missing
        container = soup.find('div', class_='row no-margin for-sign')
        paragraphs = container.find_all('p') if container is not None else []

        # Adding to the list of bios: one row per quoted string, so paragraphs
        # without a quoted bio contribute nothing instead of an empty row
        for paragraph in paragraphs:
            for quoted in re.findall('"([^"]*)"', paragraph.text):
                biolist.append([quoted])

    # Sleeping
    time.sleep(random.choice(seq))

# Surfacing skipped refreshes instead of swallowing them silently
if failed_requests:
    print(f"{failed_requests} page refreshes failed and were skipped")


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Wrapping the scraped bios in a single-column DataFrame named 'Bios'
bio_df = pd.DataFrame(data=biolist, columns=['Bios'])

In [None]:
# Column headings for the categories/topics used in personality matching
qs = [
    'Location', 'Interested in', 'Age', 'Movies',
    'Music', 'Sports', 'Politics', 'Social Media',
]

In [None]:
# Starting an empty DataFrame whose columns are the categories/topics in qs
topic_df = pd.DataFrame(data=None, columns=qs)

In [None]:
# Populating every topic column with placeholder labels, one per scraped bio
n_profiles = bio_df.shape[0]

for topic in topic_df.columns:

    # Integers 0-9 stand in for the different labels within a category
    topic_df[topic] = np.random.randint(low=0, high=10, size=n_profiles)

    # Logic: each number represents one specific choice inside the category,
    # e.g. your number 1 preferred artist/song/album under Music, your one favorite movie, etc.

In [None]:
# Viewing the random data (a bare trailing expression, so the notebook renders the frame)
topic_df

Unnamed: 0,Location,Interested in,Age,Movies,Music,Sports,Politics,Social Media
0,2,4,1,1,0,1,1,1
1,7,9,7,1,8,0,4,4
2,0,2,4,9,1,2,9,8
3,1,3,9,1,8,4,9,6
4,2,2,7,8,8,6,6,1
...,...,...,...,...,...,...,...,...
4795,2,6,7,5,4,0,3,1
4796,2,7,6,9,1,7,6,5
4797,4,7,9,4,5,4,8,5
4798,0,0,7,0,7,7,5,6


In [None]:
# Attaching the topic columns to the bios; both frames share the same default row index
final_df = bio_df.join(other=topic_df)
final_df

Unnamed: 0,Bios,Location,Interested in,Age,Movies,Music,Sports,Politics,Social Media
0,Professional coffee practitioner. General entr...,2,4,1,1,0,1,1,1
1,General reader. Incurable introvert. Internet ...,7,9,7,1,8,0,4,4
2,Problem solver. Troublemaker. Extreme reader. ...,0,2,4,9,1,2,9,8
3,Entrepreneur. Communicator. Devoted baconaholi...,1,3,9,1,8,4,9,6
4,Twitter trailblazer. Internet maven. Incurable...,2,2,7,8,8,6,6,1
...,...,...,...,...,...,...,...,...,...
4795,Web trailblazer. Pop culture fanatic. Incurabl...,2,6,7,5,4,0,3,1
4796,Hardcore communicator. Internet scholar. Avid ...,2,7,6,9,1,7,6,5
4797,Travel trailblazer. Wannabe social media fanat...,4,7,9,4,5,4,8,5
4798,Troublemaker. Twitter nerd. Avid beer scholar....,0,0,7,0,7,7,5,6


## Refining the Data
Adding information to categories

In [None]:
# Working on an independent copy: plain assignment would only alias final_df,
# so the column overwrites performed on new_df would silently rewrite final_df too
new_df = final_df.copy()
from scipy.stats import halfnorm

### Creating lists for categories

In [None]:
# --- Option lists: the labels a profile can take in each category ---

location = ['New England Region', 'Mid-Atlantic Region', 'Southern Region',
            'Mid-West Region', 'South-West Region', 'Rocky Mountains',
            'Pacific Coastal Region']

volunteering = ['Animal Welfare', 'Youth Empowerment', 'Elder Care',
                'LGBTQ+ Advocacy', 'Education', 'Public Health',
                'Environmental Conservation']

movies = ['Adventure', 'Action', 'Drama', 'Comedy', 'Thriller',
          'Horror', 'RomCom', 'Musical', 'Documentary']

music = ['Rock', 'HipHop', 'Pop', 'Country', 'Latin',
         'EDM', 'Gospel', 'Jazz', 'Classical']

sports = ['Football', 'Baseball', 'Basketball', 'Hockey', 'Soccer', 'Other']

politics = ['Liberal', 'Progressive', 'Centrist', 'Moderate', 'Conservative']

social = ['Facebook', 'Youtube', 'Twitter', 'Reddit', 'Instagram',
          'Pinterest', 'LinkedIn', 'SnapChat', 'TikTok']

# --- Probability dictionary: column name -> sampling weight per option ---
# Each weight list lines up position-by-position with its option list above.
p = {
    'Location':      [0.21, 0.16, 0.14, 0.09, 0.06, 0.04, 0.3],
    'Interested in': [0.20, 0.10, 0.16, 0.14, 0.09, 0.11, 0.2],
    'Movies':        [0.28, 0.21, 0.16, 0.14, 0.09, 0.06, 0.04, 0.01, 0.01],
    'Music':         [0.30, 0.23, 0.20, 0.10, 0.06, 0.04, 0.03, 0.02, 0.02],
    'Sports':        [0.34, 0.30, 0.16, 0.13, 0.04, 0.03],
    'Politics':      [0.26, 0.11, 0.11, 0.15, 0.37],
    'Social Media':  [0.36, 0.27, 0.11, 0.09, 0.05, 0.03, 0.03, 0.03, 0.03],
}

# --- Age: one half-normal draw per profile (loc=18, scale=8), truncated to int ---
n_profiles = new_df.shape[0]
age = halfnorm.rvs(loc=18, scale=8, size=n_profiles).astype(int)

# --- Pairing each column name with its source of values ---
# The two lists are parallel: names[k] labels categories[k].
names = ['Location', 'Interested in', 'Age', 'Movies',
         'Music', 'Politics', 'Sports', 'Social Media']

categories = [location, volunteering, age, movies,
              music, politics, sports, social]

combined = {label: values for label, values in zip(names, categories)}

## Establishing random values for each category

In [None]:
# Looping through and assigning random values
for name, cats in combined.items():
    if name in ['Location', 'Interested in', 'Politics']:
        # Picking only 1 from the list, weighted by the category's probabilities
        new_df[name] = np.random.choice(cats, new_df.shape[0], p=p[name])

    elif name == 'Age':
        # Ages were pre-generated from a half-normal distribution, one per row
        new_df[name] = cats
    else:
        # Picking 3 from the list (with replacement) for every row.
        # p.get(name) is None for a category without weights, which makes
        # np.random.choice sample uniformly; a weight list of the wrong length
        # raises instead of being silently ignored.
        picks = np.random.choice(cats, size=(new_df.shape[0], 3), p=p.get(name))

        # Collapsing repeated picks so each row holds a flat list of 1-3 distinct
        # options; dict.fromkeys keeps first-seen order, unlike set()
        unique_picks = [list(dict.fromkeys(row)) for row in picks.tolist()]

        # Wrapping in a Series keeps every list as a single cell value
        new_df[name] = pd.Series(unique_picks, index=new_df.index)

In [None]:
# Displaying the refined profiles after the category values were assigned
new_df

Unnamed: 0,Bios,Location,Interested in,Age,Movies,Music,Sports,Politics,Social Media
0,Professional coffee practitioner. General entr...,Southern Region,Public Health,22,"[[Action, Drama, Action]]","[[Rock, Latin, Rock]]","[[Soccer, Football, Basketball]]",Centrist,"[[Facebook, Facebook, Facebook]]"
1,General reader. Incurable introvert. Internet ...,Mid-Atlantic Region,LGBTQ+ Advocacy,21,"[[Thriller, Horror, Comedy]]","[[HipHop, Rock, Classical]]","[[Baseball, Other, Basketball]]",Conservative,"[[Youtube, Youtube, Facebook]]"
2,Problem solver. Troublemaker. Extreme reader. ...,New England Region,LGBTQ+ Advocacy,25,"[[Adventure, Adventure, Action]]","[[HipHop, Rock, Pop]]","[[Football, Hockey, Soccer]]",Liberal,"[[Facebook, Twitter, Youtube]]"
3,Entrepreneur. Communicator. Devoted baconaholi...,Southern Region,Youth Empowerment,18,"[[Action, Action, Adventure]]","[[HipHop, Pop, Country]]","[[Football, Baseball, Football]]",Centrist,"[[Facebook, Facebook, Youtube]]"
4,Twitter trailblazer. Internet maven. Incurable...,Pacific Coastal Region,LGBTQ+ Advocacy,25,"[[Action, Action, Drama]]","[[HipHop, Pop, HipHop]]","[[Baseball, Basketball, Baseball]]",Conservative,"[[Facebook, Youtube, Reddit]]"
...,...,...,...,...,...,...,...,...,...
4795,Web trailblazer. Pop culture fanatic. Incurabl...,New England Region,Animal Welfare,21,"[[Action, Action, Adventure]]","[[Pop, Rock, Latin]]","[[Baseball, Hockey, Football]]",Liberal,"[[Youtube, Facebook, Facebook]]"
4796,Hardcore communicator. Internet scholar. Avid ...,Pacific Coastal Region,Elder Care,22,"[[Comedy, Comedy, Adventure]]","[[Rock, Latin, Gospel]]","[[Baseball, Football, Football]]",Conservative,"[[Facebook, Facebook, Reddit]]"
4797,Travel trailblazer. Wannabe social media fanat...,Pacific Coastal Region,Education,25,"[[Adventure, Thriller, Adventure]]","[[Pop, Country, HipHop]]","[[Baseball, Baseball, Baseball]]",Moderate,"[[Youtube, LinkedIn, LinkedIn]]"
4798,Troublemaker. Twitter nerd. Avid beer scholar....,South-West Region,LGBTQ+ Advocacy,30,"[[Action, Action, Thriller]]","[[Latin, Rock, Pop]]","[[Football, Baseball, Baseball]]",Moderate,"[[SnapChat, Facebook, Instagram]]"


## Export data

In [None]:
# Serialising the finished profile table to disk in binary mode
with open("user_data.pkl", mode='wb') as out_file:
    pickle.dump(new_df, out_file)