In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Data Science Process

1. Define the problem.
2. Obtain the data.
3. **Explore the data.**
4. Model the data.
5. Evaluate the model.
6. Answer the problem.

In [2]:
# Import data sets and merge them
canon1 = pd.read_csv('./data/canon1.csv')
canon2 = pd.read_csv('./data/canon2.csv')
nikon1 = pd.read_csv('./data/nikon1.csv')
nikon2 = pd.read_csv('./data/nikon2.csv')

# The Sony subreddit was scraped for back up data and future expansion of project.
sony1 = pd.read_csv('./data/sony1.csv')
sony2 = pd.read_csv('./data/sony2.csv')

# Merging by camera brand.
# pd.concat is used instead of DataFrame.append: append was deprecated in pandas 1.4
# and removed in 2.0. ignore_index=True rebuilds a clean 0..n-1 index, as before.
canon = pd.concat([canon1, canon2], ignore_index=True)
nikon = pd.concat([nikon1, nikon2], ignore_index=True)
sony = pd.concat([sony1, sony2], ignore_index=True)

# Modelling frame: Canon rows first, then Nikon rows.
photo = pd.concat([canon, nikon], ignore_index=True)
# Sony is intentionally left out of the modelling frame for now; to include it:
#photo = pd.concat([photo, sony], ignore_index=True)

In [3]:
# Keep only the post body, the label and the title, then summarise what is left
wanted = ['selftext', 'subreddit', 'title']
is_wanted = photo.columns.isin(wanted)  # boolean mask keeps the original column order
photo = photo.loc[:, is_wanted]
photo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   selftext   1047 non-null   object
 1   subreddit  2000 non-null   object
 2   title      2000 non-null   object
dtypes: object(3)
memory usage: 47.0+ KB


In [4]:
# Mark missing values with the placeholder 'NA' instead of dropping the rows
photo = photo.fillna('NA')

# Author's note: Canon has aggressive mods, so a vast majority of post bodies were removed.
# Dropping those rows would leave barely any data, so '[removed]' bodies get the same
# 'NA' placeholder as missing ones.
was_removed = photo['selftext'] == '[removed]'
photo['selftext'] = photo['selftext'].mask(was_removed, 'NA')

photo.head(10)

Unnamed: 0,selftext,subreddit,title
0,,canon,"From a shoot last summer on the XC15, call me ..."
1,So even though my Cannon MG7720 printer says i...,canon,Unable To Get Cannon 7720 Connected Wifi
2,,canon,Cannot Attach Sigma 16mm 1.4 to Canon M50
3,I’m wondering if anyone can point out a reliab...,canon,Canon 77D power
4,I'm trying to use the Canon EOS RP for a live ...,canon,Canon EOS RP clean HDMI out?
5,,canon,What lens is being used at the 4:40 mark in th...
6,,canon,Rebel T6
7,"Hello, I hope you are all doing well. I took s...",canon,Canon rebel T7i Question
8,,canon,Question: RF 24-70 vs RF 24-105 for video? Wor...
9,,canon,Shutter Count (Canon 80D)


In [5]:
# Sanity check before building features: total count of missing values across all columns
photo.isna().sum().sum()

0

In [6]:
# Combine text for larger analysis
# Plain column assignment overwrites 'combo' when the cell is re-run, whereas
# DataFrame.insert raises ValueError once the column already exists. With the three
# columns kept earlier, the new column still lands in the last position.
photo['combo'] = photo['selftext'] + ' ' + photo['title']

# Binary target: canon -> 1, Nikon -> 0 (keys are case-sensitive and must match the data).
# Values that are not keys of the mapping pass through unchanged, so running the cell a
# second time keeps the 1/0 codes instead of turning every label into NaN.
label_codes = {'canon': 1, 'Nikon': 0}
photo['subreddit'] = photo['subreddit'].map(lambda name: label_codes.get(name, name))
photo.head()

Unnamed: 0,selftext,subreddit,title,combo
0,,1,"From a shoot last summer on the XC15, call me ...","NA From a shoot last summer on the XC15, call ..."
1,So even though my Cannon MG7720 printer says i...,1,Unable To Get Cannon 7720 Connected Wifi,So even though my Cannon MG7720 printer says i...
2,,1,Cannot Attach Sigma 16mm 1.4 to Canon M50,NA Cannot Attach Sigma 16mm 1.4 to Canon M50
3,I’m wondering if anyone can point out a reliab...,1,Canon 77D power,I’m wondering if anyone can point out a reliab...
4,I'm trying to use the Canon EOS RP for a live ...,1,Canon EOS RP clean HDMI out?,I'm trying to use the Canon EOS RP for a live ...


In [7]:
# Save the datasets: one merged file per brand plus the combined modelling frame.
# NOTE(review): pandas.read_csv treats the literal text 'NA' as missing by default, so the
# 'NA' placeholders in photo will load back as NaN unless keep_default_na=False is used.
outputs = {
    './data/canon_merge.csv': canon,
    './data/nikon_merge.csv': nikon,
    './data/sony_merge.csv': sony,
    './data/photo.csv': photo,
}
for path, frame in outputs.items():
    frame.to_csv(path)