In [9]:
import numpy as np
import pandas as pd
from faker import Faker
import datetime

In [3]:
class CustomerDataGeneration:
    def __init__(self,faker_locale,seed,count_to_generate):
        self.faker_seed = seed
        self.numpy_seed = seed
        self.count_to_generate = count_to_generate
        self.faker_locale = faker_locale
        self.fake = Faker(self.faker_locale)
        
        # Data block
        self.names = []
        self.first_name = []
        self.last_name = []
        self.date_of_birth = [] #   20 < date < 82 years
        self.gender = []
        self.email = []
        self.phone_number = []
        self.agree_for_promo = []
        self.autopay_card = []
        self.customer_category = []
        self.language = []
        self.customer_since = []
        self.region = []
        self.status = []
        self.customer_data_frame = pd.DataFrame(columns = ['ID',
                                                           'first_name',
                                                           'last_name',
                                                           'date_of_birth',
                                                           'gender',
                                                           'email',
                                                           'MSISDN',
                                                           'agree_for_promo',
                                                           'autopay_card',
                                                          'customer_category',
                                                          'language',
                                                          'customer_since',
                                                          'region',
                                                          'status'])
    def generate_names(self):
        self.names = []   
        for _ in range(self.count_to_generate):
            self.names.append(self.fake.name())
        self._split_names()
    
    def generate_gender(self, prob_M = 0.487, prob_F = 0.513):
        gender_list = self._dist_data_gen(gen_mask = ['M','F'],probs = [prob_M, prob_F])
        self.gender = list(gender_list)
             
    def generate_email(self):
        self.email = []   
        for _ in range(self.count_to_generate):
            self.email.append(self.fake.bothify(text='?*******#')+"@"+self.fake.free_email_domain())
            
    def generate_customer_category(self, prob_business = 0.02, prob_physical = 0.98):
        self.customer_category = []
        self.customer_category = self._dist_data_gen(gen_mask = ['business','physical'],probs = [prob_business, prob_physical])
        
    def generate_agree_for_promo(self, prob_Y = 0.33, prob_N = 0.67):
        self.agree_for_promo =[]
        self.agree_for_promo = list(self._dist_data_gen(gen_mask = ['Yes', 'No'], probs = [prob_Y, prob_N]))
    
    def generate_autopay_card(self,prob_Y=0.6365):
        self.autopay_card = []
        self.autopay_card = list(self._dist_data_gen(gen_mask = ['Yes', 'No'], probs = [prob_Y, 1-prob_Y]))
    
    def generate_birth_date(self):
        self.date_of_birth = []   
        for _ in range(self.count_to_generate):
            self.date_of_birth.append(self.fake.date_between(start_date='-82y', end_date='-15y'))
    
    # Можно добавить логику
    def generate_customer_since(self):
        # since 2000
        self.customer_since = []
        generations = np.random.choice(["3g","lte","5g"], size = self.count_to_generate, p = [0.2,0.6,0.2])
       
        for gen in generations:
            if gen == "3g":
                self.customer_since.append(self.fake.date_between(start_date=datetime.date(2000,1,1), end_date=datetime.date(2010,1,1)))
            elif gen =="lte":
                self.customer_since.append(self.fake.date_between(start_date=datetime.date(2010,1,1), end_date=datetime.date(2020,1,1)))
            elif gen =="5g":
                self.customer_since.append(self.fake.date_between(start_date=datetime.date(2020,1,1), end_date=datetime.date.today()))
            else:
                self.self.customer_since.append(self.fake.date_between(start_date=datetime.date(2000,1,1), end_date=datetime.date.today()))
    
    def generate_region(self):
        regions = ['Hokkaidō', 'Tōhoku', 'Kantō', 'Chūbu', 'Kansai(Kinki)', 'Chūgoku',
       'Shikoku', 'Kyūshū & Okinawa']
        percents_of_region = [0.04213037, 0.06995231, 0.34340223, 0.16931638, 0.17806041,
       0.0572337 , 0.02941176, 0.11049285]
        self.region = []
        self.region = list(self._dist_data_gen(gen_mask = regions, probs = percents_of_region))
        
    def generate_status(self,prob_A = 1.0, prob_I=0.0):
        self.status = []
        self. status = list(self._dist_data_gen(gen_mask = ["Active","Inactive"], probs = [prob_A, prob_I]))
        
    def generate_phone_number(self):
        self.phone_number = []
        for _ in range(self.count_to_generate):
            self.phone_number.append(self.fake.numerify(text='90-####-####'))
    
    def generate_language(self,prob_jap = 0.986, prob_ch = 0.009,prob_kor = 0.005):
        self.language = []
        self.language = list(self._dist_data_gen(gen_mask = ['Japanese', 'Chinese','Korean'], probs = [prob_jap, prob_ch,prob_kor]))
    
    def _split_names(self):
        self.first_name = []
        self.last_name = []
        for i in range(len(self.names)):
            self.first_name.append(self.names[i].split(' ')[0])
            self.last_name.append(self.names[i].split(' ')[1])
    
    def create_data_frame(self):
        self.customer_data_frame['ID'] = np.arange(0,self.count_to_generate)
        self.customer_data_frame['first_name'] = self.first_name
        self.customer_data_frame['last_name'] = self.last_name
        self.customer_data_frame['date_of_birth'] = self.date_of_birth
        self.customer_data_frame['gender'] = self.gender
        self.customer_data_frame['email'] = self.email
        self.customer_data_frame['MSISDN'] = self.phone_number
        self.customer_data_frame['agree_for_promo'] = self.agree_for_promo
        self.customer_data_frame['autopay_card'] = self.autopay_card
        self.customer_data_frame['customer_category'] = self.customer_category
        self.customer_data_frame['language'] = self.language
        self.customer_data_frame['customer_since'] = self.customer_since
        self.customer_data_frame['region'] = self.region
        self.customer_data_frame['status'] = self.status
        
        return self.customer_data_frame
                                                           
    def generate_all_data(self):
        self.generate_names()
        self.generate_birth_date()
        self.generate_gender()
        self.generate_email()
        self.generate_phone_number()
        self.generate_agree_for_promo()
        self.generate_autopay_card()
        self.generate_customer_category()
        self.generate_language()
        self.generate_customer_since()
        self.generate_region()
        self.generate_status()
        
    def _dist_data_gen(self,gen_mask=[0,1],probs=[0.5,0.5]):
        return np.random.choice(gen_mask, size=self.count_to_generate, p=probs)
         
    def customer_data_print(self):
        print("ID:",np.arange(0,self.count_to_generate),
            "\nFirst Names:",self.first_name,
              "\nLast Names:",self.last_name,
              "\nFull Names:",self.names,
              "\nDate of Birth:",self.date_of_birth,
             "\nGender:",self.gender,
             "\nEmail:",self.email,
             "\nMSISDN",self.phone_number,
             "\nagree_for_promo",self.agree_for_promo,
             "\nautopay_card",self.autopay_card,
             "\ncustomer_category",self.customer_category,
             "\nlanguage",self.language,
             "\ncustomer_since",self.customer_since,
             "\nregion",self.region,
             "\nStatus",self.status)
    
    def save_to_csv(self,file_name="Customer.csv"):
        self.customer_data_frame.to_csv(file_name,index=False)

In [11]:
cdg = CustomerDataGeneration('ja_JP',0,10000)

In [12]:
%%time
cdg.generate_all_data()
df = cdg.create_data_frame()

In [None]:
cdg.customer_data_print()

In [13]:
cdg.save_to_csv()

First Names: ['中村', '藤田', '吉田', '加藤', '加藤'] 
Last Names: ['知実', 'くみ子', '七夏', '和也', '香織'] 
Full Names: ['中村 知実', '藤田 くみ子', '吉田 七夏', '加藤 和也', '加藤 香織'] 
Date of Birth: [datetime.date(1977, 7, 28), datetime.date(1989, 9, 14), datetime.date(1996, 8, 31), datetime.date(1954, 10, 26), datetime.date(1948, 1, 19)] 
Gender: ['F', 'M', 'F', 'M', 'F'] 
Email: ['yosukesakamoto@yahoo.com', 'fyoshida@hotmail.com', 'takumaogawa@hotmail.com', 'fnakamura@yahoo.com', 'maisasaki@gmail.com']
